diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..6a46808b046c7c6366da2e7943a8def04e4acd55 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-6500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-7500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-7584/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-1000/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
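As a starting point, here is a minimal sketch for loading this checkpoint as a PEFT LoRA adapter on top of the base model named in `adapter_config.json` (`unsloth/Llama-3.2-3B-Instruct`). The local path `checkpoint-1000`, the prompt, and the generation settings are illustrative assumptions, not documented usage from this repository.

```python
# Minimal sketch (assumed local path "checkpoint-1000" and a toy prompt).
# Loads the base model declared in adapter_config.json and applies this LoRA adapter.
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

adapter_path = "checkpoint-1000"  # hypothetical path to one of the saved checkpoints

model = AutoPeftModelForCausalLM.from_pretrained(adapter_path)
tokenizer = AutoTokenizer.from_pretrained(adapter_path)

# Build a chat-formatted prompt with the tokenizer's bundled chat template.
messages = [{"role": "user", "content": "Summarize what a LoRA adapter is in one sentence."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

output_ids = model.generate(input_ids, max_new_tokens=64)
# Decode only the newly generated tokens.
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

If a standalone model is preferred, the adapter can typically be folded into the base weights with `model.merge_and_unload()` before saving.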
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..083327e0c9f3dd1a9eda2bc8b558f75005b93594 --- /dev/null +++ b/checkpoint-1000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "v_proj", + "k_proj", + "down_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1000/adapter_model.safetensors b/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..80b553b63c77ee05772b187909e3049659642226 --- /dev/null +++ b/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b6cf1a5705aab5c8565e7ed0b7ff124fa86e863ec0bac75311fa1ad692c001 +size 48680136 diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d029a283fc1941b1b09048c0911b8a40670c419e --- 
/dev/null +++ b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9ec21c3d3cf2cc7cb3c2d48e6420cfa8f03d507f097ec80482cdb71deedae12 +size 49846644 diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e218b1d3f00cd4743f46779a9f88c797d567564a --- /dev/null +++ b/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38ca74abb9af1cc9151b91103870122159c1f0b4cbbe035d58feaf102cb270d5 +size 14244 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b611097fc94b30a80793f7a47ba199f11519f974 --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5cb34dde9a5646c673426da1d510aa200cd59945601d8c338cc6b922b739cf7 +size 1064 diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-1000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1000/tokenizer.json b/checkpoint-1000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-1000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b505f7771010e0b872d94ddb23916f0e9ebe5d6f --- /dev/null +++ b/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": 
"<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": 
"<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": 
"<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": 
"<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": 
"<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": 
"<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": 
"<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": 
"<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c992ea2952a36135f0ae7d6e0c3dfee0a308b24f --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,1451 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2636609320413948, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 
0.0919, + "step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 
0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + "learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + 
"learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 
0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, + "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + 
"learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + "learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 
0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + "grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 
0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + "step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 
0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + 
"save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.07022836673536e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d172b090b85619dc1adf130a66b333f6491d2342 --- /dev/null +++ b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cee8b68ab8cbec1968c4eec747022ea57cd350072097ab37179dd29309655d7 +size 5688 diff --git a/checkpoint-1500/README.md b/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-1500/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-1500/adapter_config.json b/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..083327e0c9f3dd1a9eda2bc8b558f75005b93594 --- /dev/null +++ b/checkpoint-1500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "v_proj", + "k_proj", + "down_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1500/adapter_model.safetensors b/checkpoint-1500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d8057545b41b1021859cf5c7a57f0add3d242ed7 --- /dev/null +++ b/checkpoint-1500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e71210cede591304d9f8e8baa28687d2dcc8e875430074ed063c2323e4df27de +size 48680136 diff --git a/checkpoint-1500/optimizer.pt b/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..24912430b83e07632ff83c12f092afeb86fed8a6 --- /dev/null +++ b/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d59fe0c31f6a03193e69e338856a0c6ee57c366f05092f94f1d1272bfc4d1a42 +size 49846644 diff --git a/checkpoint-1500/rng_state.pth b/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..72fb1e0a1617ca54de275aab22cabeae4eb430e3 --- /dev/null +++ b/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ba77c4358d5913436545fc6995706402cd54ccf015646708e622eca7f93ed87 +size 14244 diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ccc5e9cc4e83407dced9e9c5982540bd6dc81d0 --- /dev/null +++ b/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a8e488f9cd81d6d035d07c333b21224930ff58c17d27dbd21d9101e5eca05d7d +size 1064 diff --git a/checkpoint-1500/special_tokens_map.json b/checkpoint-1500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-1500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1500/tokenizer.json b/checkpoint-1500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-1500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-1500/tokenizer_config.json b/checkpoint-1500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b505f7771010e0b872d94ddb23916f0e9ebe5d6f --- /dev/null +++ b/checkpoint-1500/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": 
"<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": 
"<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": 
"<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": 
"<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": 
"<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": 
"<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": 
"<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": 
"<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": 
"<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1407ac202d133a56fcce7d0cb777dab28da0367f --- /dev/null +++ b/checkpoint-1500/trainer_state.json @@ -0,0 +1,2160 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + 
"epoch": 0.39549139806209216, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + "step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 
0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + 
"learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + "learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + 
"learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, 
+ "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + "learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + 
"learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + 
"grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + 
"step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 
0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + "loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + 
"learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + 
"epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 
0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + 
"epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.210468921462825e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1500/training_args.bin b/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d172b090b85619dc1adf130a66b333f6491d2342 --- /dev/null +++ b/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cee8b68ab8cbec1968c4eec747022ea57cd350072097ab37179dd29309655d7 +size 5688 diff --git a/checkpoint-2000/README.md b/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-2000/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases 
and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-2000/adapter_config.json b/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-2000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2000/adapter_model.safetensors b/checkpoint-2000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..afa000ed776b5369e3730017a1a2442782370f7d --- /dev/null +++ b/checkpoint-2000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7879a2046058809073bce1db45056566a43bc72d3a3abc6227d7ac61454497be +size 97307544 diff --git 
a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..47884612cb927157fde078aa1d684bf79758325c --- /dev/null +++ b/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f10c469d028995fe33d6f97b402322e974d8ebaffe915876f1930af0de5a7c9 +size 50866370 diff --git a/checkpoint-2000/rng_state.pth b/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..aabf76f7fa00c9fd97c271e6e0be6d4f91d26c61 --- /dev/null +++ b/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd55c3b412ed63dda63d944e5148e12c72c1382a50017bf4d626548cc6ff19a6 +size 14244 diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a36c020e396e761a56320dbfea16770286815548 --- /dev/null +++ b/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a02bc39f46f30e3cf91b0d3feeb294f6ed0b411fe847f9f462a0df4af85e4e0 +size 1064 diff --git a/checkpoint-2000/special_tokens_map.json b/checkpoint-2000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-2000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-2000/tokenizer.json b/checkpoint-2000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-2000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-2000/tokenizer_config.json b/checkpoint-2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-2000/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + 
"content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + 
"content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + 
"content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + 
"content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + 
"content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + 
"content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + 
"content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..39eb8f75b6386275aed1556ba58893779f0db930 --- /dev/null +++ b/checkpoint-2000/trainer_state.json @@ -0,0 +1,2868 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5273218640827896, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + 
"step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 
0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + "learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + 
"learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 
0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, + "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + 
"learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + "learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 
0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + "grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 
0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + "step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 
0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + 
"loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + "learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 
0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + "epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 
0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 
0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + "epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 
0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 
0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + "loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + 
}, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 
0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + "learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 
0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6140463556817715e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-2500/README.md b/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-2500/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information 
Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-2500/adapter_config.json b/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-2500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2500/adapter_model.safetensors b/checkpoint-2500/adapter_model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..6fa1e391523015357560511ad62b54fab4de2b2a --- /dev/null +++ b/checkpoint-2500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a6211c65c5d4a0068d229acf0a297a352ad92f0b64f5164e0d585aee56e6e6d +size 97307544 diff --git a/checkpoint-2500/optimizer.pt b/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c8a4922d13d67fd769daddd0a76705e9688d8fe --- /dev/null +++ b/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f0c2fb708dde05534fb432625f17830675269b0ce06972aa78dfd143c6fdea2 +size 50866370 diff --git a/checkpoint-2500/rng_state.pth b/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ae6a8030e2eeddac7b3375dc13d67509075b8cb --- /dev/null +++ b/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:301bae54f3043b54f78ee582aa05b8bf01996d027ad704f9a95f10d8be516262 +size 14244 diff --git a/checkpoint-2500/scheduler.pt b/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3c233c3fbc5176f131c153028d5f72c5bb54a01 --- /dev/null +++ b/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb2463bd0f8eade5c4e011fd3af237c0b30fc7086ad97457f073fe0018b5d34c +size 1064 diff --git a/checkpoint-2500/special_tokens_map.json b/checkpoint-2500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-2500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-2500/tokenizer.json b/checkpoint-2500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-2500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-2500/tokenizer_config.json b/checkpoint-2500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-2500/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": 
"<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + 
"content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": 
"<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": 
"<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": 
"<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": 
"<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": 
"<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": 
"<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": 
"<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": 
"<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is 
defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-2500/trainer_state.json b/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c3abf305f7cac3ced835323ff637904cdfc5f4e8 --- /dev/null +++ b/checkpoint-2500/trainer_state.json @@ -0,0 +1,3576 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.659152330103487, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + 
"step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 
0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + "learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + 
"learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 
0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, + "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + 
"learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + "learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 
0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + "grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 
0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + "step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 
0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + 
"loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + "learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 
0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + "epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 
0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 
0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + "epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 
0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 
0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + "loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + 
}, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 
0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + "learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 
0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + 
"loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + 
"grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + "learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 
0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + "step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, 
+ { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0176108255414272e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2500/training_args.bin b/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-3000/README.md b/checkpoint-3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-3000/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-3000/adapter_config.json b/checkpoint-3000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-3000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3000/adapter_model.safetensors b/checkpoint-3000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8726bdbbf0458c630f2cb56144243917e008e1de --- /dev/null +++ b/checkpoint-3000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f0f44795d8575a3c49edf9d1d7c450bed2a65e0cb892185ee78c4885650e54f +size 97307544 diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e510b1b76c852404c3a7052236923fc44acf18e1 --- /dev/null +++ b/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c41fb29aec65b50a5d99c079cd25509fe2a31b0a42659ed697257bb037877391 +size 50866370 diff --git a/checkpoint-3000/rng_state.pth b/checkpoint-3000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2c4dce652584d7d056c998c5fab0505aab7f484e --- /dev/null +++ b/checkpoint-3000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cf932362a907148c2c99f7826e21fa7280b5016d990a6358e60ec3cec98b016 +size 14244 diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b29b6a21e99a09c0a0d353fe405a5209e47dc74 --- /dev/null +++ b/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:11e649e977f25e5103317e72d38179f43abd114c1b5c871edab687a3d17f73b4 +size 1064 diff --git a/checkpoint-3000/special_tokens_map.json b/checkpoint-3000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-3000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-3000/tokenizer.json b/checkpoint-3000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-3000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-3000/tokenizer_config.json b/checkpoint-3000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-3000/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": 
"<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": 
"<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": 
"<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": 
"<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": 
"<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": 
"<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": 
"<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": 
"<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": 
"<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..53d110c0b50f3c48418ab00932384cf47d2936c6 --- /dev/null +++ b/checkpoint-3000/trainer_state.json @@ -0,0 +1,4284 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 
0.7909827961241843, + "eval_steps": 500, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + "step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 
0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + 
"learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + "learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + 
"learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, 
+ "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + "learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + 
"learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + 
"grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + 
"step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 
0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + "loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + 
"learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + 
"epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 
0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + 
"epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 
0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + 
"loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + }, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 
0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + 
"learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 
0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + "loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 
0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + "grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + 
"learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + 
"step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, + { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 
0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.03460961952805519, + "learning_rate": 0.00013402823591502838, + "loss": 0.0097, + "step": 2505 + }, + { + "epoch": 0.6617889394239008, + "grad_norm": 0.31785696744918823, + "learning_rate": 0.00013389629238685842, + "loss": 0.0303, + "step": 2510 + }, + { + "epoch": 0.6631072440841078, + "grad_norm": 0.4273851215839386, + "learning_rate": 0.0001337643488586885, + "loss": 0.0499, + "step": 2515 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.02236153744161129, + "learning_rate": 0.00013363240533051856, + "loss": 0.0069, + "step": 2520 + }, + { + "epoch": 0.6657438534045218, + "grad_norm": 0.1592864990234375, + "learning_rate": 0.0001335004618023486, + "loss": 0.0326, + "step": 2525 + }, + { + "epoch": 0.6670621580647288, + "grad_norm": 0.029961545020341873, + "learning_rate": 0.00013336851827417867, + "loss": 0.0178, + "step": 2530 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.03120764158666134, + "learning_rate": 0.00013323657474600872, + "loss": 0.115, + "step": 2535 + }, + { + "epoch": 0.6696987673851427, + "grad_norm": 0.01060028001666069, + "learning_rate": 0.00013310463121783879, + "loss": 0.0036, + "step": 2540 + }, + { + "epoch": 0.6710170720453497, + "grad_norm": 0.053470809012651443, + "learning_rate": 0.00013297268768966883, + "loss": 0.0079, + "step": 2545 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.022777097299695015, + "learning_rate": 0.00013284074416149887, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6736536813657636, + "grad_norm": 0.0548521913588047, + "learning_rate": 0.00013270880063332894, + "loss": 0.0503, + "step": 2555 + }, + { + "epoch": 0.6749719860259706, + "grad_norm": 0.02028457075357437, + "learning_rate": 0.00013257685710515898, + "loss": 0.0096, + "step": 2560 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.01569107361137867, + "learning_rate": 0.00013244491357698905, + "loss": 0.008, + "step": 2565 + }, + { + "epoch": 0.6776085953463845, + "grad_norm": 0.00743742985650897, + "learning_rate": 0.00013231297004881912, + "loss": 0.005, + "step": 2570 + }, + { + "epoch": 0.6789269000065915, + "grad_norm": 0.025164416059851646, + "learning_rate": 0.00013218102652064916, + "loss": 0.018, + "step": 2575 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.3653188645839691, + "learning_rate": 0.00013204908299247923, + "loss": 0.0295, + "step": 2580 + }, + { + "epoch": 0.6815635093270055, + "grad_norm": 0.685422956943512, + "learning_rate": 0.00013191713946430927, + "loss": 0.0335, + "step": 2585 + }, + { + "epoch": 0.6828818139872125, + "grad_norm": 0.675740122795105, + "learning_rate": 0.00013178519593613934, + "loss": 0.0592, + "step": 2590 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.10513252764940262, + "learning_rate": 0.00013165325240796938, + "loss": 0.0353, + "step": 2595 + }, + { + "epoch": 0.6855184233076264, + "grad_norm": 0.43512973189353943, + "learning_rate": 0.00013152130887979945, + "loss": 0.0142, + "step": 2600 + }, + { + "epoch": 0.6868367279678333, + "grad_norm": 0.029436839744448662, + "learning_rate": 0.00013138936535162952, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.5607122778892517, + "learning_rate": 0.00013125742182345957, + "loss": 0.0184, + "step": 2610 + }, + { + "epoch": 0.6894733372882473, + "grad_norm": 0.11365406215190887, + 
"learning_rate": 0.00013112547829528963, + "loss": 0.006, + "step": 2615 + }, + { + "epoch": 0.6907916419484543, + "grad_norm": 0.047227244824171066, + "learning_rate": 0.00013099353476711968, + "loss": 0.008, + "step": 2620 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.0005877618095837533, + "learning_rate": 0.00013086159123894975, + "loss": 0.0286, + "step": 2625 + }, + { + "epoch": 0.6934282512688682, + "grad_norm": 0.010759112425148487, + "learning_rate": 0.0001307296477107798, + "loss": 0.0062, + "step": 2630 + }, + { + "epoch": 0.6947465559290752, + "grad_norm": 0.07117745280265808, + "learning_rate": 0.00013059770418260986, + "loss": 0.0891, + "step": 2635 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.0639057606458664, + "learning_rate": 0.00013046576065443993, + "loss": 0.0072, + "step": 2640 + }, + { + "epoch": 0.6973831652494892, + "grad_norm": 0.027350090444087982, + "learning_rate": 0.00013033381712626994, + "loss": 0.0103, + "step": 2645 + }, + { + "epoch": 0.6987014699096962, + "grad_norm": 0.015336195938289165, + "learning_rate": 0.0001302018735981, + "loss": 0.0041, + "step": 2650 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 1.0650830268859863, + "learning_rate": 0.00013006993006993008, + "loss": 0.0443, + "step": 2655 + }, + { + "epoch": 0.70133807923011, + "grad_norm": 0.019073212519288063, + "learning_rate": 0.00012993798654176012, + "loss": 0.0331, + "step": 2660 + }, + { + "epoch": 0.702656383890317, + "grad_norm": 0.10109209269285202, + "learning_rate": 0.0001298060430135902, + "loss": 0.0054, + "step": 2665 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.03528957813978195, + "learning_rate": 0.00012967409948542023, + "loss": 0.0427, + "step": 2670 + }, + { + "epoch": 0.705292993210731, + "grad_norm": 0.03577788919210434, + "learning_rate": 0.0001295421559572503, + "loss": 0.023, + "step": 2675 + }, + { + "epoch": 0.706611297870938, + "grad_norm": 0.5576180815696716, + "learning_rate": 0.00012941021242908034, + "loss": 0.0416, + "step": 2680 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.017131298780441284, + "learning_rate": 0.0001292782689009104, + "loss": 0.0235, + "step": 2685 + }, + { + "epoch": 0.7092479071913519, + "grad_norm": 0.8517888784408569, + "learning_rate": 0.00012914632537274048, + "loss": 0.0168, + "step": 2690 + }, + { + "epoch": 0.7105662118515589, + "grad_norm": 0.23812156915664673, + "learning_rate": 0.00012901438184457052, + "loss": 0.0483, + "step": 2695 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.11746613681316376, + "learning_rate": 0.0001288824383164006, + "loss": 0.0255, + "step": 2700 + }, + { + "epoch": 0.7132028211719729, + "grad_norm": 0.20089928805828094, + "learning_rate": 0.00012875049478823064, + "loss": 0.0267, + "step": 2705 + }, + { + "epoch": 0.7145211258321799, + "grad_norm": 0.8301129937171936, + "learning_rate": 0.0001286185512600607, + "loss": 0.016, + "step": 2710 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.01838674768805504, + "learning_rate": 0.00012848660773189077, + "loss": 0.0229, + "step": 2715 + }, + { + "epoch": 0.7171577351525937, + "grad_norm": 0.03670337051153183, + "learning_rate": 0.00012835466420372082, + "loss": 0.038, + "step": 2720 + }, + { + "epoch": 0.7184760398128007, + "grad_norm": 0.0452633760869503, + "learning_rate": 0.00012822272067555089, + "loss": 0.0622, + "step": 2725 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.09503110498189926, + "learning_rate": 0.00012809077714738093, + "loss": 0.0209, + "step": 2730 + }, 
+ { + "epoch": 0.7211126491332147, + "grad_norm": 1.0327308177947998, + "learning_rate": 0.000127958833619211, + "loss": 0.0361, + "step": 2735 + }, + { + "epoch": 0.7224309537934217, + "grad_norm": 1.0049290657043457, + "learning_rate": 0.00012782689009104104, + "loss": 0.0365, + "step": 2740 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.029774073511362076, + "learning_rate": 0.00012769494656287108, + "loss": 0.0257, + "step": 2745 + }, + { + "epoch": 0.7250675631138356, + "grad_norm": 0.20974040031433105, + "learning_rate": 0.00012756300303470115, + "loss": 0.0542, + "step": 2750 + }, + { + "epoch": 0.7263858677740426, + "grad_norm": 0.8153854608535767, + "learning_rate": 0.0001274310595065312, + "loss": 0.0216, + "step": 2755 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.4393698573112488, + "learning_rate": 0.00012729911597836126, + "loss": 0.0451, + "step": 2760 + }, + { + "epoch": 0.7290224770944566, + "grad_norm": 0.06990349292755127, + "learning_rate": 0.00012716717245019133, + "loss": 0.03, + "step": 2765 + }, + { + "epoch": 0.7303407817546635, + "grad_norm": 0.32689470052719116, + "learning_rate": 0.00012703522892202137, + "loss": 0.0263, + "step": 2770 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.026600876823067665, + "learning_rate": 0.00012690328539385144, + "loss": 0.0404, + "step": 2775 + }, + { + "epoch": 0.7329773910750774, + "grad_norm": 0.11228257417678833, + "learning_rate": 0.00012677134186568148, + "loss": 0.0224, + "step": 2780 + }, + { + "epoch": 0.7342956957352844, + "grad_norm": 0.6469443440437317, + "learning_rate": 0.00012663939833751155, + "loss": 0.0178, + "step": 2785 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.020773250609636307, + "learning_rate": 0.0001265074548093416, + "loss": 0.011, + "step": 2790 + }, + { + "epoch": 0.7369323050556984, + "grad_norm": 0.7378728985786438, + "learning_rate": 0.00012637551128117167, + "loss": 0.0227, + "step": 2795 + }, + { + "epoch": 0.7382506097159054, + "grad_norm": 0.008189595304429531, + "learning_rate": 0.00012624356775300173, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.031633853912353516, + "learning_rate": 0.00012611162422483178, + "loss": 0.0093, + "step": 2805 + }, + { + "epoch": 0.7408872190363193, + "grad_norm": 0.5078475475311279, + "learning_rate": 0.00012597968069666185, + "loss": 0.0567, + "step": 2810 + }, + { + "epoch": 0.7422055236965263, + "grad_norm": 0.21766887605190277, + "learning_rate": 0.0001258477371684919, + "loss": 0.0485, + "step": 2815 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.3029612898826599, + "learning_rate": 0.00012571579364032196, + "loss": 0.032, + "step": 2820 + }, + { + "epoch": 0.7448421330169402, + "grad_norm": 1.2135159969329834, + "learning_rate": 0.00012558385011215203, + "loss": 0.0139, + "step": 2825 + }, + { + "epoch": 0.7461604376771472, + "grad_norm": 0.016875172033905983, + "learning_rate": 0.00012545190658398207, + "loss": 0.0323, + "step": 2830 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.08923230320215225, + "learning_rate": 0.00012531996305581214, + "loss": 0.0343, + "step": 2835 + }, + { + "epoch": 0.7487970469975611, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.00012518801952764215, + "loss": 0.0431, + "step": 2840 + }, + { + "epoch": 0.7501153516577681, + "grad_norm": 0.7344386577606201, + "learning_rate": 0.00012505607599947222, + "loss": 0.0389, + "step": 2845 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.03681635856628418, + 
"learning_rate": 0.0001249241324713023, + "loss": 0.0258, + "step": 2850 + }, + { + "epoch": 0.7527519609781821, + "grad_norm": 0.22866861522197723, + "learning_rate": 0.00012479218894313233, + "loss": 0.0223, + "step": 2855 + }, + { + "epoch": 0.7540702656383891, + "grad_norm": 0.029770435765385628, + "learning_rate": 0.0001246602454149624, + "loss": 0.0205, + "step": 2860 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.011845707893371582, + "learning_rate": 0.00012452830188679244, + "loss": 0.0252, + "step": 2865 + }, + { + "epoch": 0.756706874958803, + "grad_norm": 0.06696149706840515, + "learning_rate": 0.00012439635835862251, + "loss": 0.0166, + "step": 2870 + }, + { + "epoch": 0.75802517961901, + "grad_norm": 0.01653144136071205, + "learning_rate": 0.00012426441483045256, + "loss": 0.0487, + "step": 2875 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.031312476843595505, + "learning_rate": 0.00012413247130228263, + "loss": 0.0155, + "step": 2880 + }, + { + "epoch": 0.7606617889394239, + "grad_norm": 0.011625733226537704, + "learning_rate": 0.0001240005277741127, + "loss": 0.0333, + "step": 2885 + }, + { + "epoch": 0.7619800935996309, + "grad_norm": 0.012089414522051811, + "learning_rate": 0.00012386858424594274, + "loss": 0.003, + "step": 2890 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.3012307584285736, + "learning_rate": 0.0001237366407177728, + "loss": 0.0172, + "step": 2895 + }, + { + "epoch": 0.7646167029200448, + "grad_norm": 0.31575000286102295, + "learning_rate": 0.00012360469718960285, + "loss": 0.0409, + "step": 2900 + }, + { + "epoch": 0.7659350075802518, + "grad_norm": 0.009794364683330059, + "learning_rate": 0.00012347275366143292, + "loss": 0.0214, + "step": 2905 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5973085165023804, + "learning_rate": 0.00012334081013326299, + "loss": 0.0245, + "step": 2910 + }, + { + "epoch": 0.7685716169006658, + "grad_norm": 0.019750040024518967, + "learning_rate": 0.00012320886660509303, + "loss": 0.0063, + "step": 2915 + }, + { + "epoch": 0.7698899215608728, + "grad_norm": 0.06402858346700668, + "learning_rate": 0.0001230769230769231, + "loss": 0.0444, + "step": 2920 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.02876671403646469, + "learning_rate": 0.00012294497954875314, + "loss": 0.0103, + "step": 2925 + }, + { + "epoch": 0.7725265308812866, + "grad_norm": 0.6962207555770874, + "learning_rate": 0.0001228130360205832, + "loss": 0.0318, + "step": 2930 + }, + { + "epoch": 0.7738448355414936, + "grad_norm": 0.006536522414535284, + "learning_rate": 0.00012268109249241325, + "loss": 0.0096, + "step": 2935 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.07097168266773224, + "learning_rate": 0.0001225491489642433, + "loss": 0.0174, + "step": 2940 + }, + { + "epoch": 0.7764814448619076, + "grad_norm": 0.042360126972198486, + "learning_rate": 0.00012241720543607336, + "loss": 0.0158, + "step": 2945 + }, + { + "epoch": 0.7777997495221146, + "grad_norm": 0.01159572321921587, + "learning_rate": 0.0001222852619079034, + "loss": 0.0265, + "step": 2950 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.38408163189888, + "learning_rate": 0.00012215331837973347, + "loss": 0.0233, + "step": 2955 + }, + { + "epoch": 0.7804363588425285, + "grad_norm": 0.15588605403900146, + "learning_rate": 0.00012202137485156353, + "loss": 0.0041, + "step": 2960 + }, + { + "epoch": 0.7817546635027355, + "grad_norm": 0.006892362609505653, + "learning_rate": 0.00012188943132339358, + "loss": 0.0026, + 
"step": 2965 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.030915727838873863, + "learning_rate": 0.00012175748779522364, + "loss": 0.0028, + "step": 2970 + }, + { + "epoch": 0.7843912728231495, + "grad_norm": 0.8151025772094727, + "learning_rate": 0.00012162554426705371, + "loss": 0.0429, + "step": 2975 + }, + { + "epoch": 0.7857095774833565, + "grad_norm": 0.6765475273132324, + "learning_rate": 0.00012149360073888377, + "loss": 0.0319, + "step": 2980 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.054469238966703415, + "learning_rate": 0.00012136165721071382, + "loss": 0.0413, + "step": 2985 + }, + { + "epoch": 0.7883461868037703, + "grad_norm": 0.045610666275024414, + "learning_rate": 0.00012122971368254388, + "loss": 0.0521, + "step": 2990 + }, + { + "epoch": 0.7896644914639773, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.00012109777015437393, + "loss": 0.0846, + "step": 2995 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.0272397268563509, + "learning_rate": 0.00012096582662620399, + "loss": 0.0364, + "step": 3000 + }, + { + "epoch": 0.7909827961241843, + "eval_loss": 0.033312585204839706, + "eval_runtime": 452.2552, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 3000 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.421149366682501e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-3500/README.md b/checkpoint-3500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-3500/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. 
+ +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-3500/adapter_config.json b/checkpoint-3500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-3500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3500/adapter_model.safetensors b/checkpoint-3500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f6c6ff167a303c0ce2346b60dfffb15904720140 --- /dev/null +++ b/checkpoint-3500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47845f406e6f0019a93febf797d0ffc5ac771d8378b2e912f0873545d52b3af5 +size 97307544 diff --git a/checkpoint-3500/optimizer.pt b/checkpoint-3500/optimizer.pt new file mode 100644 
index 0000000000000000000000000000000000000000..61d2b6a1e14ade1ad163a99ed519b110f5f4987c --- /dev/null +++ b/checkpoint-3500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b7a2e686ad5dc19a46d78ab4f95be14bbc7b04f90a7c4a34bad527b1f908e87 +size 50866370 diff --git a/checkpoint-3500/rng_state.pth b/checkpoint-3500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2dead01948d5231153bda20bf054a0b7f2ad6e7e --- /dev/null +++ b/checkpoint-3500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd648700cf16d48c46790a09971fd0550fa206decd3bba89fc5daf71df8e36e5 +size 14244 diff --git a/checkpoint-3500/scheduler.pt b/checkpoint-3500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..140b5a83da01ef1f9a8dfe00ef216842a30976af --- /dev/null +++ b/checkpoint-3500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be3fc2cd1e171555bde1f0c8647fe20253e23bd12ba526b024d8b3efc68a40c +size 1064 diff --git a/checkpoint-3500/special_tokens_map.json b/checkpoint-3500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-3500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-3500/tokenizer.json b/checkpoint-3500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-3500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-3500/tokenizer_config.json b/checkpoint-3500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-3500/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": 
"<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": 
"<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": 
"<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": 
"<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": 
"<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": 
"<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": 
"<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": 
"<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": 
"<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": 
"<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": 
"<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-3500/trainer_state.json b/checkpoint-3500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bcdf6177a3c7730a1047a86db0508f12009f0f95 --- /dev/null +++ b/checkpoint-3500/trainer_state.json @@ -0,0 +1,4992 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9228132621448817, + "eval_steps": 500, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + 
"step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 
0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + "learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + 
"learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 
0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, + "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + 
"learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + "learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 
0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + "grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 
0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + "step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 
0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + 
"loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + "learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 
0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + "epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 
0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 
0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + "epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 
0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 
0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + "loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + 
}, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 
0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + "learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 
0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + 
"loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + 
"grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + "learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 
0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + "step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, 
+ { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.03460961952805519, + "learning_rate": 0.00013402823591502838, + "loss": 0.0097, + "step": 2505 + }, + { + "epoch": 0.6617889394239008, + "grad_norm": 0.31785696744918823, + "learning_rate": 0.00013389629238685842, + "loss": 0.0303, + "step": 2510 + }, + { + "epoch": 0.6631072440841078, + "grad_norm": 0.4273851215839386, + "learning_rate": 0.0001337643488586885, + "loss": 0.0499, + "step": 2515 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.02236153744161129, + "learning_rate": 0.00013363240533051856, + "loss": 0.0069, + "step": 2520 + }, + { + "epoch": 0.6657438534045218, + "grad_norm": 0.1592864990234375, + "learning_rate": 0.0001335004618023486, + "loss": 0.0326, + "step": 2525 + }, + { + "epoch": 0.6670621580647288, + 
"grad_norm": 0.029961545020341873, + "learning_rate": 0.00013336851827417867, + "loss": 0.0178, + "step": 2530 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.03120764158666134, + "learning_rate": 0.00013323657474600872, + "loss": 0.115, + "step": 2535 + }, + { + "epoch": 0.6696987673851427, + "grad_norm": 0.01060028001666069, + "learning_rate": 0.00013310463121783879, + "loss": 0.0036, + "step": 2540 + }, + { + "epoch": 0.6710170720453497, + "grad_norm": 0.053470809012651443, + "learning_rate": 0.00013297268768966883, + "loss": 0.0079, + "step": 2545 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.022777097299695015, + "learning_rate": 0.00013284074416149887, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6736536813657636, + "grad_norm": 0.0548521913588047, + "learning_rate": 0.00013270880063332894, + "loss": 0.0503, + "step": 2555 + }, + { + "epoch": 0.6749719860259706, + "grad_norm": 0.02028457075357437, + "learning_rate": 0.00013257685710515898, + "loss": 0.0096, + "step": 2560 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.01569107361137867, + "learning_rate": 0.00013244491357698905, + "loss": 0.008, + "step": 2565 + }, + { + "epoch": 0.6776085953463845, + "grad_norm": 0.00743742985650897, + "learning_rate": 0.00013231297004881912, + "loss": 0.005, + "step": 2570 + }, + { + "epoch": 0.6789269000065915, + "grad_norm": 0.025164416059851646, + "learning_rate": 0.00013218102652064916, + "loss": 0.018, + "step": 2575 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.3653188645839691, + "learning_rate": 0.00013204908299247923, + "loss": 0.0295, + "step": 2580 + }, + { + "epoch": 0.6815635093270055, + "grad_norm": 0.685422956943512, + "learning_rate": 0.00013191713946430927, + "loss": 0.0335, + "step": 2585 + }, + { + "epoch": 0.6828818139872125, + "grad_norm": 0.675740122795105, + "learning_rate": 0.00013178519593613934, + "loss": 0.0592, + "step": 2590 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.10513252764940262, + "learning_rate": 0.00013165325240796938, + "loss": 0.0353, + "step": 2595 + }, + { + "epoch": 0.6855184233076264, + "grad_norm": 0.43512973189353943, + "learning_rate": 0.00013152130887979945, + "loss": 0.0142, + "step": 2600 + }, + { + "epoch": 0.6868367279678333, + "grad_norm": 0.029436839744448662, + "learning_rate": 0.00013138936535162952, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.5607122778892517, + "learning_rate": 0.00013125742182345957, + "loss": 0.0184, + "step": 2610 + }, + { + "epoch": 0.6894733372882473, + "grad_norm": 0.11365406215190887, + "learning_rate": 0.00013112547829528963, + "loss": 0.006, + "step": 2615 + }, + { + "epoch": 0.6907916419484543, + "grad_norm": 0.047227244824171066, + "learning_rate": 0.00013099353476711968, + "loss": 0.008, + "step": 2620 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.0005877618095837533, + "learning_rate": 0.00013086159123894975, + "loss": 0.0286, + "step": 2625 + }, + { + "epoch": 0.6934282512688682, + "grad_norm": 0.010759112425148487, + "learning_rate": 0.0001307296477107798, + "loss": 0.0062, + "step": 2630 + }, + { + "epoch": 0.6947465559290752, + "grad_norm": 0.07117745280265808, + "learning_rate": 0.00013059770418260986, + "loss": 0.0891, + "step": 2635 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.0639057606458664, + "learning_rate": 0.00013046576065443993, + "loss": 0.0072, + "step": 2640 + }, + { + "epoch": 0.6973831652494892, + "grad_norm": 0.027350090444087982, + "learning_rate": 
0.00013033381712626994, + "loss": 0.0103, + "step": 2645 + }, + { + "epoch": 0.6987014699096962, + "grad_norm": 0.015336195938289165, + "learning_rate": 0.0001302018735981, + "loss": 0.0041, + "step": 2650 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 1.0650830268859863, + "learning_rate": 0.00013006993006993008, + "loss": 0.0443, + "step": 2655 + }, + { + "epoch": 0.70133807923011, + "grad_norm": 0.019073212519288063, + "learning_rate": 0.00012993798654176012, + "loss": 0.0331, + "step": 2660 + }, + { + "epoch": 0.702656383890317, + "grad_norm": 0.10109209269285202, + "learning_rate": 0.0001298060430135902, + "loss": 0.0054, + "step": 2665 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.03528957813978195, + "learning_rate": 0.00012967409948542023, + "loss": 0.0427, + "step": 2670 + }, + { + "epoch": 0.705292993210731, + "grad_norm": 0.03577788919210434, + "learning_rate": 0.0001295421559572503, + "loss": 0.023, + "step": 2675 + }, + { + "epoch": 0.706611297870938, + "grad_norm": 0.5576180815696716, + "learning_rate": 0.00012941021242908034, + "loss": 0.0416, + "step": 2680 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.017131298780441284, + "learning_rate": 0.0001292782689009104, + "loss": 0.0235, + "step": 2685 + }, + { + "epoch": 0.7092479071913519, + "grad_norm": 0.8517888784408569, + "learning_rate": 0.00012914632537274048, + "loss": 0.0168, + "step": 2690 + }, + { + "epoch": 0.7105662118515589, + "grad_norm": 0.23812156915664673, + "learning_rate": 0.00012901438184457052, + "loss": 0.0483, + "step": 2695 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.11746613681316376, + "learning_rate": 0.0001288824383164006, + "loss": 0.0255, + "step": 2700 + }, + { + "epoch": 0.7132028211719729, + "grad_norm": 0.20089928805828094, + "learning_rate": 0.00012875049478823064, + "loss": 0.0267, + "step": 2705 + }, + { + "epoch": 0.7145211258321799, + "grad_norm": 0.8301129937171936, + "learning_rate": 0.0001286185512600607, + "loss": 0.016, + "step": 2710 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.01838674768805504, + "learning_rate": 0.00012848660773189077, + "loss": 0.0229, + "step": 2715 + }, + { + "epoch": 0.7171577351525937, + "grad_norm": 0.03670337051153183, + "learning_rate": 0.00012835466420372082, + "loss": 0.038, + "step": 2720 + }, + { + "epoch": 0.7184760398128007, + "grad_norm": 0.0452633760869503, + "learning_rate": 0.00012822272067555089, + "loss": 0.0622, + "step": 2725 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.09503110498189926, + "learning_rate": 0.00012809077714738093, + "loss": 0.0209, + "step": 2730 + }, + { + "epoch": 0.7211126491332147, + "grad_norm": 1.0327308177947998, + "learning_rate": 0.000127958833619211, + "loss": 0.0361, + "step": 2735 + }, + { + "epoch": 0.7224309537934217, + "grad_norm": 1.0049290657043457, + "learning_rate": 0.00012782689009104104, + "loss": 0.0365, + "step": 2740 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.029774073511362076, + "learning_rate": 0.00012769494656287108, + "loss": 0.0257, + "step": 2745 + }, + { + "epoch": 0.7250675631138356, + "grad_norm": 0.20974040031433105, + "learning_rate": 0.00012756300303470115, + "loss": 0.0542, + "step": 2750 + }, + { + "epoch": 0.7263858677740426, + "grad_norm": 0.8153854608535767, + "learning_rate": 0.0001274310595065312, + "loss": 0.0216, + "step": 2755 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.4393698573112488, + "learning_rate": 0.00012729911597836126, + "loss": 0.0451, + "step": 2760 + }, + { + "epoch": 
0.7290224770944566, + "grad_norm": 0.06990349292755127, + "learning_rate": 0.00012716717245019133, + "loss": 0.03, + "step": 2765 + }, + { + "epoch": 0.7303407817546635, + "grad_norm": 0.32689470052719116, + "learning_rate": 0.00012703522892202137, + "loss": 0.0263, + "step": 2770 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.026600876823067665, + "learning_rate": 0.00012690328539385144, + "loss": 0.0404, + "step": 2775 + }, + { + "epoch": 0.7329773910750774, + "grad_norm": 0.11228257417678833, + "learning_rate": 0.00012677134186568148, + "loss": 0.0224, + "step": 2780 + }, + { + "epoch": 0.7342956957352844, + "grad_norm": 0.6469443440437317, + "learning_rate": 0.00012663939833751155, + "loss": 0.0178, + "step": 2785 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.020773250609636307, + "learning_rate": 0.0001265074548093416, + "loss": 0.011, + "step": 2790 + }, + { + "epoch": 0.7369323050556984, + "grad_norm": 0.7378728985786438, + "learning_rate": 0.00012637551128117167, + "loss": 0.0227, + "step": 2795 + }, + { + "epoch": 0.7382506097159054, + "grad_norm": 0.008189595304429531, + "learning_rate": 0.00012624356775300173, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.031633853912353516, + "learning_rate": 0.00012611162422483178, + "loss": 0.0093, + "step": 2805 + }, + { + "epoch": 0.7408872190363193, + "grad_norm": 0.5078475475311279, + "learning_rate": 0.00012597968069666185, + "loss": 0.0567, + "step": 2810 + }, + { + "epoch": 0.7422055236965263, + "grad_norm": 0.21766887605190277, + "learning_rate": 0.0001258477371684919, + "loss": 0.0485, + "step": 2815 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.3029612898826599, + "learning_rate": 0.00012571579364032196, + "loss": 0.032, + "step": 2820 + }, + { + "epoch": 0.7448421330169402, + "grad_norm": 1.2135159969329834, + "learning_rate": 0.00012558385011215203, + "loss": 0.0139, + "step": 2825 + }, + { + "epoch": 0.7461604376771472, + "grad_norm": 0.016875172033905983, + "learning_rate": 0.00012545190658398207, + "loss": 0.0323, + "step": 2830 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.08923230320215225, + "learning_rate": 0.00012531996305581214, + "loss": 0.0343, + "step": 2835 + }, + { + "epoch": 0.7487970469975611, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.00012518801952764215, + "loss": 0.0431, + "step": 2840 + }, + { + "epoch": 0.7501153516577681, + "grad_norm": 0.7344386577606201, + "learning_rate": 0.00012505607599947222, + "loss": 0.0389, + "step": 2845 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.03681635856628418, + "learning_rate": 0.0001249241324713023, + "loss": 0.0258, + "step": 2850 + }, + { + "epoch": 0.7527519609781821, + "grad_norm": 0.22866861522197723, + "learning_rate": 0.00012479218894313233, + "loss": 0.0223, + "step": 2855 + }, + { + "epoch": 0.7540702656383891, + "grad_norm": 0.029770435765385628, + "learning_rate": 0.0001246602454149624, + "loss": 0.0205, + "step": 2860 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.011845707893371582, + "learning_rate": 0.00012452830188679244, + "loss": 0.0252, + "step": 2865 + }, + { + "epoch": 0.756706874958803, + "grad_norm": 0.06696149706840515, + "learning_rate": 0.00012439635835862251, + "loss": 0.0166, + "step": 2870 + }, + { + "epoch": 0.75802517961901, + "grad_norm": 0.01653144136071205, + "learning_rate": 0.00012426441483045256, + "loss": 0.0487, + "step": 2875 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.031312476843595505, + 
"learning_rate": 0.00012413247130228263, + "loss": 0.0155, + "step": 2880 + }, + { + "epoch": 0.7606617889394239, + "grad_norm": 0.011625733226537704, + "learning_rate": 0.0001240005277741127, + "loss": 0.0333, + "step": 2885 + }, + { + "epoch": 0.7619800935996309, + "grad_norm": 0.012089414522051811, + "learning_rate": 0.00012386858424594274, + "loss": 0.003, + "step": 2890 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.3012307584285736, + "learning_rate": 0.0001237366407177728, + "loss": 0.0172, + "step": 2895 + }, + { + "epoch": 0.7646167029200448, + "grad_norm": 0.31575000286102295, + "learning_rate": 0.00012360469718960285, + "loss": 0.0409, + "step": 2900 + }, + { + "epoch": 0.7659350075802518, + "grad_norm": 0.009794364683330059, + "learning_rate": 0.00012347275366143292, + "loss": 0.0214, + "step": 2905 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5973085165023804, + "learning_rate": 0.00012334081013326299, + "loss": 0.0245, + "step": 2910 + }, + { + "epoch": 0.7685716169006658, + "grad_norm": 0.019750040024518967, + "learning_rate": 0.00012320886660509303, + "loss": 0.0063, + "step": 2915 + }, + { + "epoch": 0.7698899215608728, + "grad_norm": 0.06402858346700668, + "learning_rate": 0.0001230769230769231, + "loss": 0.0444, + "step": 2920 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.02876671403646469, + "learning_rate": 0.00012294497954875314, + "loss": 0.0103, + "step": 2925 + }, + { + "epoch": 0.7725265308812866, + "grad_norm": 0.6962207555770874, + "learning_rate": 0.0001228130360205832, + "loss": 0.0318, + "step": 2930 + }, + { + "epoch": 0.7738448355414936, + "grad_norm": 0.006536522414535284, + "learning_rate": 0.00012268109249241325, + "loss": 0.0096, + "step": 2935 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.07097168266773224, + "learning_rate": 0.0001225491489642433, + "loss": 0.0174, + "step": 2940 + }, + { + "epoch": 0.7764814448619076, + "grad_norm": 0.042360126972198486, + "learning_rate": 0.00012241720543607336, + "loss": 0.0158, + "step": 2945 + }, + { + "epoch": 0.7777997495221146, + "grad_norm": 0.01159572321921587, + "learning_rate": 0.0001222852619079034, + "loss": 0.0265, + "step": 2950 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.38408163189888, + "learning_rate": 0.00012215331837973347, + "loss": 0.0233, + "step": 2955 + }, + { + "epoch": 0.7804363588425285, + "grad_norm": 0.15588605403900146, + "learning_rate": 0.00012202137485156353, + "loss": 0.0041, + "step": 2960 + }, + { + "epoch": 0.7817546635027355, + "grad_norm": 0.006892362609505653, + "learning_rate": 0.00012188943132339358, + "loss": 0.0026, + "step": 2965 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.030915727838873863, + "learning_rate": 0.00012175748779522364, + "loss": 0.0028, + "step": 2970 + }, + { + "epoch": 0.7843912728231495, + "grad_norm": 0.8151025772094727, + "learning_rate": 0.00012162554426705371, + "loss": 0.0429, + "step": 2975 + }, + { + "epoch": 0.7857095774833565, + "grad_norm": 0.6765475273132324, + "learning_rate": 0.00012149360073888377, + "loss": 0.0319, + "step": 2980 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.054469238966703415, + "learning_rate": 0.00012136165721071382, + "loss": 0.0413, + "step": 2985 + }, + { + "epoch": 0.7883461868037703, + "grad_norm": 0.045610666275024414, + "learning_rate": 0.00012122971368254388, + "loss": 0.0521, + "step": 2990 + }, + { + "epoch": 0.7896644914639773, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.00012109777015437393, + "loss": 0.0846, + 
"step": 2995 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.0272397268563509, + "learning_rate": 0.00012096582662620399, + "loss": 0.0364, + "step": 3000 + }, + { + "epoch": 0.7909827961241843, + "eval_loss": 0.033312585204839706, + "eval_runtime": 452.2552, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 3000 + }, + { + "epoch": 0.7923011007843913, + "grad_norm": 0.08674059063196182, + "learning_rate": 0.00012083388309803406, + "loss": 0.0081, + "step": 3005 + }, + { + "epoch": 0.7936194054445982, + "grad_norm": 0.21960832178592682, + "learning_rate": 0.00012070193956986411, + "loss": 0.0468, + "step": 3010 + }, + { + "epoch": 0.7949377101048052, + "grad_norm": 0.11259289085865021, + "learning_rate": 0.00012056999604169417, + "loss": 0.0124, + "step": 3015 + }, + { + "epoch": 0.7962560147650122, + "grad_norm": 0.02945362776517868, + "learning_rate": 0.00012043805251352422, + "loss": 0.0298, + "step": 3020 + }, + { + "epoch": 0.7975743194252192, + "grad_norm": 0.27889615297317505, + "learning_rate": 0.00012030610898535428, + "loss": 0.0251, + "step": 3025 + }, + { + "epoch": 0.7988926240854262, + "grad_norm": 0.05873241275548935, + "learning_rate": 0.00012017416545718434, + "loss": 0.0132, + "step": 3030 + }, + { + "epoch": 0.8002109287456332, + "grad_norm": 0.1570046991109848, + "learning_rate": 0.00012004222192901439, + "loss": 0.0228, + "step": 3035 + }, + { + "epoch": 0.80152923340584, + "grad_norm": 0.12575332820415497, + "learning_rate": 0.00011991027840084443, + "loss": 0.0049, + "step": 3040 + }, + { + "epoch": 0.802847538066047, + "grad_norm": 0.8416435122489929, + "learning_rate": 0.00011977833487267449, + "loss": 0.0542, + "step": 3045 + }, + { + "epoch": 0.804165842726254, + "grad_norm": 0.2605098485946655, + "learning_rate": 0.00011964639134450454, + "loss": 0.0084, + "step": 3050 + }, + { + "epoch": 0.805484147386461, + "grad_norm": 0.8996294736862183, + "learning_rate": 0.00011951444781633461, + "loss": 0.0442, + "step": 3055 + }, + { + "epoch": 0.806802452046668, + "grad_norm": 2.7525105476379395, + "learning_rate": 0.00011938250428816467, + "loss": 0.0642, + "step": 3060 + }, + { + "epoch": 0.808120756706875, + "grad_norm": 0.14955930411815643, + "learning_rate": 0.00011925056075999473, + "loss": 0.0384, + "step": 3065 + }, + { + "epoch": 0.8094390613670819, + "grad_norm": 0.018756115809082985, + "learning_rate": 0.00011911861723182478, + "loss": 0.0154, + "step": 3070 + }, + { + "epoch": 0.8107573660272889, + "grad_norm": 0.23998615145683289, + "learning_rate": 0.00011898667370365484, + "loss": 0.0413, + "step": 3075 + }, + { + "epoch": 0.8120756706874959, + "grad_norm": 0.27253249287605286, + "learning_rate": 0.00011885473017548489, + "loss": 0.0081, + "step": 3080 + }, + { + "epoch": 0.8133939753477029, + "grad_norm": 0.2925993502140045, + "learning_rate": 0.00011872278664731495, + "loss": 0.0332, + "step": 3085 + }, + { + "epoch": 0.8147122800079099, + "grad_norm": 0.5364832878112793, + "learning_rate": 0.00011859084311914502, + "loss": 0.0143, + "step": 3090 + }, + { + "epoch": 0.8160305846681168, + "grad_norm": 0.32104921340942383, + "learning_rate": 0.00011845889959097507, + "loss": 0.0216, + "step": 3095 + }, + { + "epoch": 0.8173488893283237, + "grad_norm": 0.0205856766551733, + "learning_rate": 0.00011832695606280513, + "loss": 0.0346, + "step": 3100 + }, + { + "epoch": 0.8186671939885307, + "grad_norm": 0.2541547417640686, + "learning_rate": 0.00011819501253463518, + "loss": 0.0793, + "step": 3105 + }, + { + "epoch": 
0.8199854986487377, + "grad_norm": 0.08333491533994675, + "learning_rate": 0.00011806306900646524, + "loss": 0.0049, + "step": 3110 + }, + { + "epoch": 0.8213038033089447, + "grad_norm": 0.0355968177318573, + "learning_rate": 0.0001179311254782953, + "loss": 0.0051, + "step": 3115 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.06948401033878326, + "learning_rate": 0.00011779918195012536, + "loss": 0.013, + "step": 3120 + }, + { + "epoch": 0.8239404126293587, + "grad_norm": 0.03328891843557358, + "learning_rate": 0.00011766723842195542, + "loss": 0.0122, + "step": 3125 + }, + { + "epoch": 0.8252587172895656, + "grad_norm": 0.013782350346446037, + "learning_rate": 0.00011753529489378548, + "loss": 0.0073, + "step": 3130 + }, + { + "epoch": 0.8265770219497726, + "grad_norm": 0.024390392005443573, + "learning_rate": 0.00011740335136561553, + "loss": 0.0143, + "step": 3135 + }, + { + "epoch": 0.8278953266099796, + "grad_norm": 0.002548128366470337, + "learning_rate": 0.00011727140783744557, + "loss": 0.0027, + "step": 3140 + }, + { + "epoch": 0.8292136312701865, + "grad_norm": 0.11674848943948746, + "learning_rate": 0.00011713946430927563, + "loss": 0.0253, + "step": 3145 + }, + { + "epoch": 0.8305319359303935, + "grad_norm": 0.005774884019047022, + "learning_rate": 0.00011700752078110568, + "loss": 0.0018, + "step": 3150 + }, + { + "epoch": 0.8318502405906005, + "grad_norm": 0.5763069987297058, + "learning_rate": 0.00011687557725293574, + "loss": 0.0119, + "step": 3155 + }, + { + "epoch": 0.8331685452508074, + "grad_norm": 0.0027607593219727278, + "learning_rate": 0.0001167436337247658, + "loss": 0.0279, + "step": 3160 + }, + { + "epoch": 0.8344868499110144, + "grad_norm": 1.859642505645752, + "learning_rate": 0.00011661169019659585, + "loss": 0.0228, + "step": 3165 + }, + { + "epoch": 0.8358051545712214, + "grad_norm": 0.16597022116184235, + "learning_rate": 0.00011647974666842592, + "loss": 0.1228, + "step": 3170 + }, + { + "epoch": 0.8371234592314284, + "grad_norm": 0.33833742141723633, + "learning_rate": 0.00011634780314025598, + "loss": 0.073, + "step": 3175 + }, + { + "epoch": 0.8384417638916354, + "grad_norm": 0.024682912975549698, + "learning_rate": 0.00011621585961208603, + "loss": 0.0042, + "step": 3180 + }, + { + "epoch": 0.8397600685518424, + "grad_norm": 0.05926942452788353, + "learning_rate": 0.00011608391608391609, + "loss": 0.0066, + "step": 3185 + }, + { + "epoch": 0.8410783732120493, + "grad_norm": 0.1414029747247696, + "learning_rate": 0.00011595197255574614, + "loss": 0.0603, + "step": 3190 + }, + { + "epoch": 0.8423966778722562, + "grad_norm": 0.37928736209869385, + "learning_rate": 0.0001158200290275762, + "loss": 0.0266, + "step": 3195 + }, + { + "epoch": 0.8437149825324632, + "grad_norm": 0.018329354003071785, + "learning_rate": 0.00011568808549940627, + "loss": 0.0047, + "step": 3200 + }, + { + "epoch": 0.8450332871926702, + "grad_norm": 0.2993735373020172, + "learning_rate": 0.00011555614197123632, + "loss": 0.0218, + "step": 3205 + }, + { + "epoch": 0.8463515918528772, + "grad_norm": 0.1767728328704834, + "learning_rate": 0.00011542419844306638, + "loss": 0.0363, + "step": 3210 + }, + { + "epoch": 0.8476698965130842, + "grad_norm": 0.39774414896965027, + "learning_rate": 0.00011529225491489644, + "loss": 0.0506, + "step": 3215 + }, + { + "epoch": 0.8489882011732911, + "grad_norm": 0.021896762773394585, + "learning_rate": 0.00011516031138672649, + "loss": 0.0081, + "step": 3220 + }, + { + "epoch": 0.8503065058334981, + "grad_norm": 0.358372300863266, + 
"learning_rate": 0.00011502836785855655, + "loss": 0.0224, + "step": 3225 + }, + { + "epoch": 0.8516248104937051, + "grad_norm": 0.01605542004108429, + "learning_rate": 0.00011489642433038662, + "loss": 0.0215, + "step": 3230 + }, + { + "epoch": 0.8529431151539121, + "grad_norm": 0.021189266815781593, + "learning_rate": 0.00011476448080221667, + "loss": 0.0051, + "step": 3235 + }, + { + "epoch": 0.8542614198141191, + "grad_norm": 0.013394076377153397, + "learning_rate": 0.0001146325372740467, + "loss": 0.021, + "step": 3240 + }, + { + "epoch": 0.8555797244743261, + "grad_norm": 0.19848507642745972, + "learning_rate": 0.00011450059374587676, + "loss": 0.0285, + "step": 3245 + }, + { + "epoch": 0.856898029134533, + "grad_norm": 0.2463046759366989, + "learning_rate": 0.00011436865021770683, + "loss": 0.0384, + "step": 3250 + }, + { + "epoch": 0.8582163337947399, + "grad_norm": 0.37432390451431274, + "learning_rate": 0.00011423670668953688, + "loss": 0.0098, + "step": 3255 + }, + { + "epoch": 0.8595346384549469, + "grad_norm": 0.060943394899368286, + "learning_rate": 0.00011410476316136694, + "loss": 0.0087, + "step": 3260 + }, + { + "epoch": 0.8608529431151539, + "grad_norm": 0.2846696674823761, + "learning_rate": 0.00011397281963319699, + "loss": 0.0148, + "step": 3265 + }, + { + "epoch": 0.8621712477753609, + "grad_norm": 0.009311323054134846, + "learning_rate": 0.00011384087610502705, + "loss": 0.0024, + "step": 3270 + }, + { + "epoch": 0.8634895524355679, + "grad_norm": 0.046277035027742386, + "learning_rate": 0.0001137089325768571, + "loss": 0.0274, + "step": 3275 + }, + { + "epoch": 0.8648078570957748, + "grad_norm": 0.006024620030075312, + "learning_rate": 0.00011357698904868716, + "loss": 0.0286, + "step": 3280 + }, + { + "epoch": 0.8661261617559818, + "grad_norm": 0.033578380942344666, + "learning_rate": 0.00011344504552051723, + "loss": 0.0153, + "step": 3285 + }, + { + "epoch": 0.8674444664161888, + "grad_norm": 0.8537917137145996, + "learning_rate": 0.00011331310199234728, + "loss": 0.0304, + "step": 3290 + }, + { + "epoch": 0.8687627710763958, + "grad_norm": 0.013933337293565273, + "learning_rate": 0.00011318115846417734, + "loss": 0.0112, + "step": 3295 + }, + { + "epoch": 0.8700810757366028, + "grad_norm": 0.35437721014022827, + "learning_rate": 0.0001130492149360074, + "loss": 0.0228, + "step": 3300 + }, + { + "epoch": 0.8713993803968098, + "grad_norm": 1.3024121522903442, + "learning_rate": 0.00011291727140783745, + "loss": 0.0203, + "step": 3305 + }, + { + "epoch": 0.8727176850570166, + "grad_norm": 0.5131255984306335, + "learning_rate": 0.00011278532787966751, + "loss": 0.0181, + "step": 3310 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.039366886019706726, + "learning_rate": 0.00011265338435149758, + "loss": 0.0192, + "step": 3315 + }, + { + "epoch": 0.8753542943774306, + "grad_norm": 0.13679669797420502, + "learning_rate": 0.00011252144082332763, + "loss": 0.004, + "step": 3320 + }, + { + "epoch": 0.8766725990376376, + "grad_norm": 0.003076886525377631, + "learning_rate": 0.00011238949729515769, + "loss": 0.0405, + "step": 3325 + }, + { + "epoch": 0.8779909036978446, + "grad_norm": 0.019953785464167595, + "learning_rate": 0.00011225755376698774, + "loss": 0.0241, + "step": 3330 + }, + { + "epoch": 0.8793092083580516, + "grad_norm": 0.007980377413332462, + "learning_rate": 0.0001121256102388178, + "loss": 0.0064, + "step": 3335 + }, + { + "epoch": 0.8806275130182585, + "grad_norm": 0.018761295825242996, + "learning_rate": 0.00011199366671064784, + "loss": 
0.0032, + "step": 3340 + }, + { + "epoch": 0.8819458176784655, + "grad_norm": 0.022511709481477737, + "learning_rate": 0.0001118617231824779, + "loss": 0.0055, + "step": 3345 + }, + { + "epoch": 0.8832641223386725, + "grad_norm": 0.021270718425512314, + "learning_rate": 0.00011172977965430795, + "loss": 0.033, + "step": 3350 + }, + { + "epoch": 0.8845824269988795, + "grad_norm": 0.02710561640560627, + "learning_rate": 0.00011159783612613801, + "loss": 0.0094, + "step": 3355 + }, + { + "epoch": 0.8859007316590864, + "grad_norm": 0.4353378117084503, + "learning_rate": 0.00011146589259796806, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.8872190363192934, + "grad_norm": 0.0257766991853714, + "learning_rate": 0.00011133394906979813, + "loss": 0.0059, + "step": 3365 + }, + { + "epoch": 0.8885373409795003, + "grad_norm": 0.80838942527771, + "learning_rate": 0.00011120200554162819, + "loss": 0.0263, + "step": 3370 + }, + { + "epoch": 0.8898556456397073, + "grad_norm": 0.007799761835485697, + "learning_rate": 0.00011107006201345824, + "loss": 0.0028, + "step": 3375 + }, + { + "epoch": 0.8911739502999143, + "grad_norm": 0.007315775845199823, + "learning_rate": 0.0001109381184852883, + "loss": 0.0127, + "step": 3380 + }, + { + "epoch": 0.8924922549601213, + "grad_norm": 1.4861233234405518, + "learning_rate": 0.00011080617495711836, + "loss": 0.0562, + "step": 3385 + }, + { + "epoch": 0.8938105596203283, + "grad_norm": 0.010219530202448368, + "learning_rate": 0.00011067423142894841, + "loss": 0.0438, + "step": 3390 + }, + { + "epoch": 0.8951288642805353, + "grad_norm": 1.0191857814788818, + "learning_rate": 0.00011054228790077848, + "loss": 0.0493, + "step": 3395 + }, + { + "epoch": 0.8964471689407422, + "grad_norm": 0.01459536887705326, + "learning_rate": 0.00011041034437260854, + "loss": 0.0117, + "step": 3400 + }, + { + "epoch": 0.8977654736009492, + "grad_norm": 0.008682495914399624, + "learning_rate": 0.00011027840084443859, + "loss": 0.02, + "step": 3405 + }, + { + "epoch": 0.8990837782611562, + "grad_norm": 0.02197263017296791, + "learning_rate": 0.00011014645731626865, + "loss": 0.0454, + "step": 3410 + }, + { + "epoch": 0.9004020829213631, + "grad_norm": 0.01436714269220829, + "learning_rate": 0.0001100145137880987, + "loss": 0.0283, + "step": 3415 + }, + { + "epoch": 0.9017203875815701, + "grad_norm": 0.14327946305274963, + "learning_rate": 0.00010988257025992876, + "loss": 0.0461, + "step": 3420 + }, + { + "epoch": 0.9030386922417771, + "grad_norm": 1.671773910522461, + "learning_rate": 0.00010975062673175883, + "loss": 0.054, + "step": 3425 + }, + { + "epoch": 0.904356996901984, + "grad_norm": 0.009926804341375828, + "learning_rate": 0.00010961868320358888, + "loss": 0.0429, + "step": 3430 + }, + { + "epoch": 0.905675301562191, + "grad_norm": 0.554020881652832, + "learning_rate": 0.00010948673967541894, + "loss": 0.0618, + "step": 3435 + }, + { + "epoch": 0.906993606222398, + "grad_norm": 0.1399248093366623, + "learning_rate": 0.00010935479614724897, + "loss": 0.0229, + "step": 3440 + }, + { + "epoch": 0.908311910882605, + "grad_norm": 0.02739197015762329, + "learning_rate": 0.00010922285261907904, + "loss": 0.0082, + "step": 3445 + }, + { + "epoch": 0.909630215542812, + "grad_norm": 0.33394527435302734, + "learning_rate": 0.00010909090909090909, + "loss": 0.0403, + "step": 3450 + }, + { + "epoch": 0.9109485202030189, + "grad_norm": 0.08083894103765488, + "learning_rate": 0.00010895896556273915, + "loss": 0.0406, + "step": 3455 + }, + { + "epoch": 0.9122668248632259, + 
"grad_norm": 0.39336663484573364, + "learning_rate": 0.0001088270220345692, + "loss": 0.02, + "step": 3460 + }, + { + "epoch": 0.9135851295234328, + "grad_norm": 0.20481553673744202, + "learning_rate": 0.00010869507850639926, + "loss": 0.0221, + "step": 3465 + }, + { + "epoch": 0.9149034341836398, + "grad_norm": 1.4507408142089844, + "learning_rate": 0.00010856313497822932, + "loss": 0.0357, + "step": 3470 + }, + { + "epoch": 0.9162217388438468, + "grad_norm": 0.2678806483745575, + "learning_rate": 0.00010843119145005937, + "loss": 0.0181, + "step": 3475 + }, + { + "epoch": 0.9175400435040538, + "grad_norm": 0.007361674215644598, + "learning_rate": 0.00010829924792188944, + "loss": 0.0978, + "step": 3480 + }, + { + "epoch": 0.9188583481642607, + "grad_norm": 0.773695707321167, + "learning_rate": 0.0001081673043937195, + "loss": 0.0401, + "step": 3485 + }, + { + "epoch": 0.9201766528244677, + "grad_norm": 0.0010772625682875514, + "learning_rate": 0.00010803536086554955, + "loss": 0.0233, + "step": 3490 + }, + { + "epoch": 0.9214949574846747, + "grad_norm": 0.08971104770898819, + "learning_rate": 0.00010790341733737961, + "loss": 0.0319, + "step": 3495 + }, + { + "epoch": 0.9228132621448817, + "grad_norm": 0.21372731029987335, + "learning_rate": 0.00010777147380920966, + "loss": 0.0315, + "step": 3500 + }, + { + "epoch": 0.9228132621448817, + "eval_loss": 0.02952708676457405, + "eval_runtime": 451.5837, + "eval_samples_per_second": 7.467, + "eval_steps_per_second": 3.734, + "step": 3500 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8245695227532083e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3500/training_args.bin b/checkpoint-3500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-3500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-4000/README.md b/checkpoint-4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-4000/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + 
+ +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-4000/adapter_config.json b/checkpoint-4000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-4000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4000/adapter_model.safetensors b/checkpoint-4000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..56fa1b5fda365726d3342b50f7cda1b5c000e324 --- /dev/null +++ b/checkpoint-4000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:143a65d15d1f344b9a6f7a4952fe642d31d1e27ff6cfb5383afeecdf30f04f58 +size 97307544 diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b75cc16b92f92c76208cc4d2d43f635d93275fe3 --- /dev/null +++ b/checkpoint-4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beebbd0e5357bf8342f0b045d8d9f79a2987edd04dc3ba50e31f898867783f38 +size 50866370 diff --git a/checkpoint-4000/rng_state.pth b/checkpoint-4000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..41a0b2293861801edc6769975e032783c1dab393 --- /dev/null +++ b/checkpoint-4000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69b3b511d3ca97b15ae5676626f70f2841ab5615dee3dacae694c48e1f247c77 +size 14244 diff --git a/checkpoint-4000/scheduler.pt b/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..05067e95c8a91f0456326ddad26a1299088624a2 --- /dev/null +++ b/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad596de31183b2179edb2e879dba0ad2c4ca6075fac0c09ce2c78df34eab82e3 +size 1064 diff --git a/checkpoint-4000/special_tokens_map.json b/checkpoint-4000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-4000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-4000/tokenizer.json b/checkpoint-4000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-4000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-4000/tokenizer_config.json b/checkpoint-4000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-4000/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": 
"<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": 
"<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": 
"<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": 
"<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": 
"<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": 
"<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": 
"<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": 
"<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": 
"<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if 
not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..75a320d62d73c31fc81d3550ce07c924f61a21a0 --- /dev/null +++ b/checkpoint-4000/trainer_state.json @@ -0,0 +1,5700 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0548414738646101, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + 
"step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 
0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + "learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + 
"learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 
0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, + "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + 
"learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + "learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 
0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + "grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 
0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + "step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 
0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + 
"loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + "learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 
0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + "epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 
0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 
0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + "epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 
0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 
0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + "loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + 
}, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 
0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + "learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 
0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + 
"loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + 
"grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + "learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 
0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + "step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, 
+ { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.03460961952805519, + "learning_rate": 0.00013402823591502838, + "loss": 0.0097, + "step": 2505 + }, + { + "epoch": 0.6617889394239008, + "grad_norm": 0.31785696744918823, + "learning_rate": 0.00013389629238685842, + "loss": 0.0303, + "step": 2510 + }, + { + "epoch": 0.6631072440841078, + "grad_norm": 0.4273851215839386, + "learning_rate": 0.0001337643488586885, + "loss": 0.0499, + "step": 2515 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.02236153744161129, + "learning_rate": 0.00013363240533051856, + "loss": 0.0069, + "step": 2520 + }, + { + "epoch": 0.6657438534045218, + "grad_norm": 0.1592864990234375, + "learning_rate": 0.0001335004618023486, + "loss": 0.0326, + "step": 2525 + }, + { + "epoch": 0.6670621580647288, + 
"grad_norm": 0.029961545020341873, + "learning_rate": 0.00013336851827417867, + "loss": 0.0178, + "step": 2530 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.03120764158666134, + "learning_rate": 0.00013323657474600872, + "loss": 0.115, + "step": 2535 + }, + { + "epoch": 0.6696987673851427, + "grad_norm": 0.01060028001666069, + "learning_rate": 0.00013310463121783879, + "loss": 0.0036, + "step": 2540 + }, + { + "epoch": 0.6710170720453497, + "grad_norm": 0.053470809012651443, + "learning_rate": 0.00013297268768966883, + "loss": 0.0079, + "step": 2545 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.022777097299695015, + "learning_rate": 0.00013284074416149887, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6736536813657636, + "grad_norm": 0.0548521913588047, + "learning_rate": 0.00013270880063332894, + "loss": 0.0503, + "step": 2555 + }, + { + "epoch": 0.6749719860259706, + "grad_norm": 0.02028457075357437, + "learning_rate": 0.00013257685710515898, + "loss": 0.0096, + "step": 2560 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.01569107361137867, + "learning_rate": 0.00013244491357698905, + "loss": 0.008, + "step": 2565 + }, + { + "epoch": 0.6776085953463845, + "grad_norm": 0.00743742985650897, + "learning_rate": 0.00013231297004881912, + "loss": 0.005, + "step": 2570 + }, + { + "epoch": 0.6789269000065915, + "grad_norm": 0.025164416059851646, + "learning_rate": 0.00013218102652064916, + "loss": 0.018, + "step": 2575 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.3653188645839691, + "learning_rate": 0.00013204908299247923, + "loss": 0.0295, + "step": 2580 + }, + { + "epoch": 0.6815635093270055, + "grad_norm": 0.685422956943512, + "learning_rate": 0.00013191713946430927, + "loss": 0.0335, + "step": 2585 + }, + { + "epoch": 0.6828818139872125, + "grad_norm": 0.675740122795105, + "learning_rate": 0.00013178519593613934, + "loss": 0.0592, + "step": 2590 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.10513252764940262, + "learning_rate": 0.00013165325240796938, + "loss": 0.0353, + "step": 2595 + }, + { + "epoch": 0.6855184233076264, + "grad_norm": 0.43512973189353943, + "learning_rate": 0.00013152130887979945, + "loss": 0.0142, + "step": 2600 + }, + { + "epoch": 0.6868367279678333, + "grad_norm": 0.029436839744448662, + "learning_rate": 0.00013138936535162952, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.5607122778892517, + "learning_rate": 0.00013125742182345957, + "loss": 0.0184, + "step": 2610 + }, + { + "epoch": 0.6894733372882473, + "grad_norm": 0.11365406215190887, + "learning_rate": 0.00013112547829528963, + "loss": 0.006, + "step": 2615 + }, + { + "epoch": 0.6907916419484543, + "grad_norm": 0.047227244824171066, + "learning_rate": 0.00013099353476711968, + "loss": 0.008, + "step": 2620 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.0005877618095837533, + "learning_rate": 0.00013086159123894975, + "loss": 0.0286, + "step": 2625 + }, + { + "epoch": 0.6934282512688682, + "grad_norm": 0.010759112425148487, + "learning_rate": 0.0001307296477107798, + "loss": 0.0062, + "step": 2630 + }, + { + "epoch": 0.6947465559290752, + "grad_norm": 0.07117745280265808, + "learning_rate": 0.00013059770418260986, + "loss": 0.0891, + "step": 2635 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.0639057606458664, + "learning_rate": 0.00013046576065443993, + "loss": 0.0072, + "step": 2640 + }, + { + "epoch": 0.6973831652494892, + "grad_norm": 0.027350090444087982, + "learning_rate": 
0.00013033381712626994, + "loss": 0.0103, + "step": 2645 + }, + { + "epoch": 0.6987014699096962, + "grad_norm": 0.015336195938289165, + "learning_rate": 0.0001302018735981, + "loss": 0.0041, + "step": 2650 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 1.0650830268859863, + "learning_rate": 0.00013006993006993008, + "loss": 0.0443, + "step": 2655 + }, + { + "epoch": 0.70133807923011, + "grad_norm": 0.019073212519288063, + "learning_rate": 0.00012993798654176012, + "loss": 0.0331, + "step": 2660 + }, + { + "epoch": 0.702656383890317, + "grad_norm": 0.10109209269285202, + "learning_rate": 0.0001298060430135902, + "loss": 0.0054, + "step": 2665 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.03528957813978195, + "learning_rate": 0.00012967409948542023, + "loss": 0.0427, + "step": 2670 + }, + { + "epoch": 0.705292993210731, + "grad_norm": 0.03577788919210434, + "learning_rate": 0.0001295421559572503, + "loss": 0.023, + "step": 2675 + }, + { + "epoch": 0.706611297870938, + "grad_norm": 0.5576180815696716, + "learning_rate": 0.00012941021242908034, + "loss": 0.0416, + "step": 2680 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.017131298780441284, + "learning_rate": 0.0001292782689009104, + "loss": 0.0235, + "step": 2685 + }, + { + "epoch": 0.7092479071913519, + "grad_norm": 0.8517888784408569, + "learning_rate": 0.00012914632537274048, + "loss": 0.0168, + "step": 2690 + }, + { + "epoch": 0.7105662118515589, + "grad_norm": 0.23812156915664673, + "learning_rate": 0.00012901438184457052, + "loss": 0.0483, + "step": 2695 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.11746613681316376, + "learning_rate": 0.0001288824383164006, + "loss": 0.0255, + "step": 2700 + }, + { + "epoch": 0.7132028211719729, + "grad_norm": 0.20089928805828094, + "learning_rate": 0.00012875049478823064, + "loss": 0.0267, + "step": 2705 + }, + { + "epoch": 0.7145211258321799, + "grad_norm": 0.8301129937171936, + "learning_rate": 0.0001286185512600607, + "loss": 0.016, + "step": 2710 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.01838674768805504, + "learning_rate": 0.00012848660773189077, + "loss": 0.0229, + "step": 2715 + }, + { + "epoch": 0.7171577351525937, + "grad_norm": 0.03670337051153183, + "learning_rate": 0.00012835466420372082, + "loss": 0.038, + "step": 2720 + }, + { + "epoch": 0.7184760398128007, + "grad_norm": 0.0452633760869503, + "learning_rate": 0.00012822272067555089, + "loss": 0.0622, + "step": 2725 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.09503110498189926, + "learning_rate": 0.00012809077714738093, + "loss": 0.0209, + "step": 2730 + }, + { + "epoch": 0.7211126491332147, + "grad_norm": 1.0327308177947998, + "learning_rate": 0.000127958833619211, + "loss": 0.0361, + "step": 2735 + }, + { + "epoch": 0.7224309537934217, + "grad_norm": 1.0049290657043457, + "learning_rate": 0.00012782689009104104, + "loss": 0.0365, + "step": 2740 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.029774073511362076, + "learning_rate": 0.00012769494656287108, + "loss": 0.0257, + "step": 2745 + }, + { + "epoch": 0.7250675631138356, + "grad_norm": 0.20974040031433105, + "learning_rate": 0.00012756300303470115, + "loss": 0.0542, + "step": 2750 + }, + { + "epoch": 0.7263858677740426, + "grad_norm": 0.8153854608535767, + "learning_rate": 0.0001274310595065312, + "loss": 0.0216, + "step": 2755 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.4393698573112488, + "learning_rate": 0.00012729911597836126, + "loss": 0.0451, + "step": 2760 + }, + { + "epoch": 
0.7290224770944566, + "grad_norm": 0.06990349292755127, + "learning_rate": 0.00012716717245019133, + "loss": 0.03, + "step": 2765 + }, + { + "epoch": 0.7303407817546635, + "grad_norm": 0.32689470052719116, + "learning_rate": 0.00012703522892202137, + "loss": 0.0263, + "step": 2770 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.026600876823067665, + "learning_rate": 0.00012690328539385144, + "loss": 0.0404, + "step": 2775 + }, + { + "epoch": 0.7329773910750774, + "grad_norm": 0.11228257417678833, + "learning_rate": 0.00012677134186568148, + "loss": 0.0224, + "step": 2780 + }, + { + "epoch": 0.7342956957352844, + "grad_norm": 0.6469443440437317, + "learning_rate": 0.00012663939833751155, + "loss": 0.0178, + "step": 2785 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.020773250609636307, + "learning_rate": 0.0001265074548093416, + "loss": 0.011, + "step": 2790 + }, + { + "epoch": 0.7369323050556984, + "grad_norm": 0.7378728985786438, + "learning_rate": 0.00012637551128117167, + "loss": 0.0227, + "step": 2795 + }, + { + "epoch": 0.7382506097159054, + "grad_norm": 0.008189595304429531, + "learning_rate": 0.00012624356775300173, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.031633853912353516, + "learning_rate": 0.00012611162422483178, + "loss": 0.0093, + "step": 2805 + }, + { + "epoch": 0.7408872190363193, + "grad_norm": 0.5078475475311279, + "learning_rate": 0.00012597968069666185, + "loss": 0.0567, + "step": 2810 + }, + { + "epoch": 0.7422055236965263, + "grad_norm": 0.21766887605190277, + "learning_rate": 0.0001258477371684919, + "loss": 0.0485, + "step": 2815 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.3029612898826599, + "learning_rate": 0.00012571579364032196, + "loss": 0.032, + "step": 2820 + }, + { + "epoch": 0.7448421330169402, + "grad_norm": 1.2135159969329834, + "learning_rate": 0.00012558385011215203, + "loss": 0.0139, + "step": 2825 + }, + { + "epoch": 0.7461604376771472, + "grad_norm": 0.016875172033905983, + "learning_rate": 0.00012545190658398207, + "loss": 0.0323, + "step": 2830 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.08923230320215225, + "learning_rate": 0.00012531996305581214, + "loss": 0.0343, + "step": 2835 + }, + { + "epoch": 0.7487970469975611, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.00012518801952764215, + "loss": 0.0431, + "step": 2840 + }, + { + "epoch": 0.7501153516577681, + "grad_norm": 0.7344386577606201, + "learning_rate": 0.00012505607599947222, + "loss": 0.0389, + "step": 2845 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.03681635856628418, + "learning_rate": 0.0001249241324713023, + "loss": 0.0258, + "step": 2850 + }, + { + "epoch": 0.7527519609781821, + "grad_norm": 0.22866861522197723, + "learning_rate": 0.00012479218894313233, + "loss": 0.0223, + "step": 2855 + }, + { + "epoch": 0.7540702656383891, + "grad_norm": 0.029770435765385628, + "learning_rate": 0.0001246602454149624, + "loss": 0.0205, + "step": 2860 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.011845707893371582, + "learning_rate": 0.00012452830188679244, + "loss": 0.0252, + "step": 2865 + }, + { + "epoch": 0.756706874958803, + "grad_norm": 0.06696149706840515, + "learning_rate": 0.00012439635835862251, + "loss": 0.0166, + "step": 2870 + }, + { + "epoch": 0.75802517961901, + "grad_norm": 0.01653144136071205, + "learning_rate": 0.00012426441483045256, + "loss": 0.0487, + "step": 2875 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.031312476843595505, + 
"learning_rate": 0.00012413247130228263, + "loss": 0.0155, + "step": 2880 + }, + { + "epoch": 0.7606617889394239, + "grad_norm": 0.011625733226537704, + "learning_rate": 0.0001240005277741127, + "loss": 0.0333, + "step": 2885 + }, + { + "epoch": 0.7619800935996309, + "grad_norm": 0.012089414522051811, + "learning_rate": 0.00012386858424594274, + "loss": 0.003, + "step": 2890 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.3012307584285736, + "learning_rate": 0.0001237366407177728, + "loss": 0.0172, + "step": 2895 + }, + { + "epoch": 0.7646167029200448, + "grad_norm": 0.31575000286102295, + "learning_rate": 0.00012360469718960285, + "loss": 0.0409, + "step": 2900 + }, + { + "epoch": 0.7659350075802518, + "grad_norm": 0.009794364683330059, + "learning_rate": 0.00012347275366143292, + "loss": 0.0214, + "step": 2905 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5973085165023804, + "learning_rate": 0.00012334081013326299, + "loss": 0.0245, + "step": 2910 + }, + { + "epoch": 0.7685716169006658, + "grad_norm": 0.019750040024518967, + "learning_rate": 0.00012320886660509303, + "loss": 0.0063, + "step": 2915 + }, + { + "epoch": 0.7698899215608728, + "grad_norm": 0.06402858346700668, + "learning_rate": 0.0001230769230769231, + "loss": 0.0444, + "step": 2920 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.02876671403646469, + "learning_rate": 0.00012294497954875314, + "loss": 0.0103, + "step": 2925 + }, + { + "epoch": 0.7725265308812866, + "grad_norm": 0.6962207555770874, + "learning_rate": 0.0001228130360205832, + "loss": 0.0318, + "step": 2930 + }, + { + "epoch": 0.7738448355414936, + "grad_norm": 0.006536522414535284, + "learning_rate": 0.00012268109249241325, + "loss": 0.0096, + "step": 2935 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.07097168266773224, + "learning_rate": 0.0001225491489642433, + "loss": 0.0174, + "step": 2940 + }, + { + "epoch": 0.7764814448619076, + "grad_norm": 0.042360126972198486, + "learning_rate": 0.00012241720543607336, + "loss": 0.0158, + "step": 2945 + }, + { + "epoch": 0.7777997495221146, + "grad_norm": 0.01159572321921587, + "learning_rate": 0.0001222852619079034, + "loss": 0.0265, + "step": 2950 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.38408163189888, + "learning_rate": 0.00012215331837973347, + "loss": 0.0233, + "step": 2955 + }, + { + "epoch": 0.7804363588425285, + "grad_norm": 0.15588605403900146, + "learning_rate": 0.00012202137485156353, + "loss": 0.0041, + "step": 2960 + }, + { + "epoch": 0.7817546635027355, + "grad_norm": 0.006892362609505653, + "learning_rate": 0.00012188943132339358, + "loss": 0.0026, + "step": 2965 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.030915727838873863, + "learning_rate": 0.00012175748779522364, + "loss": 0.0028, + "step": 2970 + }, + { + "epoch": 0.7843912728231495, + "grad_norm": 0.8151025772094727, + "learning_rate": 0.00012162554426705371, + "loss": 0.0429, + "step": 2975 + }, + { + "epoch": 0.7857095774833565, + "grad_norm": 0.6765475273132324, + "learning_rate": 0.00012149360073888377, + "loss": 0.0319, + "step": 2980 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.054469238966703415, + "learning_rate": 0.00012136165721071382, + "loss": 0.0413, + "step": 2985 + }, + { + "epoch": 0.7883461868037703, + "grad_norm": 0.045610666275024414, + "learning_rate": 0.00012122971368254388, + "loss": 0.0521, + "step": 2990 + }, + { + "epoch": 0.7896644914639773, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.00012109777015437393, + "loss": 0.0846, + 
"step": 2995 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.0272397268563509, + "learning_rate": 0.00012096582662620399, + "loss": 0.0364, + "step": 3000 + }, + { + "epoch": 0.7909827961241843, + "eval_loss": 0.033312585204839706, + "eval_runtime": 452.2552, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 3000 + }, + { + "epoch": 0.7923011007843913, + "grad_norm": 0.08674059063196182, + "learning_rate": 0.00012083388309803406, + "loss": 0.0081, + "step": 3005 + }, + { + "epoch": 0.7936194054445982, + "grad_norm": 0.21960832178592682, + "learning_rate": 0.00012070193956986411, + "loss": 0.0468, + "step": 3010 + }, + { + "epoch": 0.7949377101048052, + "grad_norm": 0.11259289085865021, + "learning_rate": 0.00012056999604169417, + "loss": 0.0124, + "step": 3015 + }, + { + "epoch": 0.7962560147650122, + "grad_norm": 0.02945362776517868, + "learning_rate": 0.00012043805251352422, + "loss": 0.0298, + "step": 3020 + }, + { + "epoch": 0.7975743194252192, + "grad_norm": 0.27889615297317505, + "learning_rate": 0.00012030610898535428, + "loss": 0.0251, + "step": 3025 + }, + { + "epoch": 0.7988926240854262, + "grad_norm": 0.05873241275548935, + "learning_rate": 0.00012017416545718434, + "loss": 0.0132, + "step": 3030 + }, + { + "epoch": 0.8002109287456332, + "grad_norm": 0.1570046991109848, + "learning_rate": 0.00012004222192901439, + "loss": 0.0228, + "step": 3035 + }, + { + "epoch": 0.80152923340584, + "grad_norm": 0.12575332820415497, + "learning_rate": 0.00011991027840084443, + "loss": 0.0049, + "step": 3040 + }, + { + "epoch": 0.802847538066047, + "grad_norm": 0.8416435122489929, + "learning_rate": 0.00011977833487267449, + "loss": 0.0542, + "step": 3045 + }, + { + "epoch": 0.804165842726254, + "grad_norm": 0.2605098485946655, + "learning_rate": 0.00011964639134450454, + "loss": 0.0084, + "step": 3050 + }, + { + "epoch": 0.805484147386461, + "grad_norm": 0.8996294736862183, + "learning_rate": 0.00011951444781633461, + "loss": 0.0442, + "step": 3055 + }, + { + "epoch": 0.806802452046668, + "grad_norm": 2.7525105476379395, + "learning_rate": 0.00011938250428816467, + "loss": 0.0642, + "step": 3060 + }, + { + "epoch": 0.808120756706875, + "grad_norm": 0.14955930411815643, + "learning_rate": 0.00011925056075999473, + "loss": 0.0384, + "step": 3065 + }, + { + "epoch": 0.8094390613670819, + "grad_norm": 0.018756115809082985, + "learning_rate": 0.00011911861723182478, + "loss": 0.0154, + "step": 3070 + }, + { + "epoch": 0.8107573660272889, + "grad_norm": 0.23998615145683289, + "learning_rate": 0.00011898667370365484, + "loss": 0.0413, + "step": 3075 + }, + { + "epoch": 0.8120756706874959, + "grad_norm": 0.27253249287605286, + "learning_rate": 0.00011885473017548489, + "loss": 0.0081, + "step": 3080 + }, + { + "epoch": 0.8133939753477029, + "grad_norm": 0.2925993502140045, + "learning_rate": 0.00011872278664731495, + "loss": 0.0332, + "step": 3085 + }, + { + "epoch": 0.8147122800079099, + "grad_norm": 0.5364832878112793, + "learning_rate": 0.00011859084311914502, + "loss": 0.0143, + "step": 3090 + }, + { + "epoch": 0.8160305846681168, + "grad_norm": 0.32104921340942383, + "learning_rate": 0.00011845889959097507, + "loss": 0.0216, + "step": 3095 + }, + { + "epoch": 0.8173488893283237, + "grad_norm": 0.0205856766551733, + "learning_rate": 0.00011832695606280513, + "loss": 0.0346, + "step": 3100 + }, + { + "epoch": 0.8186671939885307, + "grad_norm": 0.2541547417640686, + "learning_rate": 0.00011819501253463518, + "loss": 0.0793, + "step": 3105 + }, + { + "epoch": 
0.8199854986487377, + "grad_norm": 0.08333491533994675, + "learning_rate": 0.00011806306900646524, + "loss": 0.0049, + "step": 3110 + }, + { + "epoch": 0.8213038033089447, + "grad_norm": 0.0355968177318573, + "learning_rate": 0.0001179311254782953, + "loss": 0.0051, + "step": 3115 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.06948401033878326, + "learning_rate": 0.00011779918195012536, + "loss": 0.013, + "step": 3120 + }, + { + "epoch": 0.8239404126293587, + "grad_norm": 0.03328891843557358, + "learning_rate": 0.00011766723842195542, + "loss": 0.0122, + "step": 3125 + }, + { + "epoch": 0.8252587172895656, + "grad_norm": 0.013782350346446037, + "learning_rate": 0.00011753529489378548, + "loss": 0.0073, + "step": 3130 + }, + { + "epoch": 0.8265770219497726, + "grad_norm": 0.024390392005443573, + "learning_rate": 0.00011740335136561553, + "loss": 0.0143, + "step": 3135 + }, + { + "epoch": 0.8278953266099796, + "grad_norm": 0.002548128366470337, + "learning_rate": 0.00011727140783744557, + "loss": 0.0027, + "step": 3140 + }, + { + "epoch": 0.8292136312701865, + "grad_norm": 0.11674848943948746, + "learning_rate": 0.00011713946430927563, + "loss": 0.0253, + "step": 3145 + }, + { + "epoch": 0.8305319359303935, + "grad_norm": 0.005774884019047022, + "learning_rate": 0.00011700752078110568, + "loss": 0.0018, + "step": 3150 + }, + { + "epoch": 0.8318502405906005, + "grad_norm": 0.5763069987297058, + "learning_rate": 0.00011687557725293574, + "loss": 0.0119, + "step": 3155 + }, + { + "epoch": 0.8331685452508074, + "grad_norm": 0.0027607593219727278, + "learning_rate": 0.0001167436337247658, + "loss": 0.0279, + "step": 3160 + }, + { + "epoch": 0.8344868499110144, + "grad_norm": 1.859642505645752, + "learning_rate": 0.00011661169019659585, + "loss": 0.0228, + "step": 3165 + }, + { + "epoch": 0.8358051545712214, + "grad_norm": 0.16597022116184235, + "learning_rate": 0.00011647974666842592, + "loss": 0.1228, + "step": 3170 + }, + { + "epoch": 0.8371234592314284, + "grad_norm": 0.33833742141723633, + "learning_rate": 0.00011634780314025598, + "loss": 0.073, + "step": 3175 + }, + { + "epoch": 0.8384417638916354, + "grad_norm": 0.024682912975549698, + "learning_rate": 0.00011621585961208603, + "loss": 0.0042, + "step": 3180 + }, + { + "epoch": 0.8397600685518424, + "grad_norm": 0.05926942452788353, + "learning_rate": 0.00011608391608391609, + "loss": 0.0066, + "step": 3185 + }, + { + "epoch": 0.8410783732120493, + "grad_norm": 0.1414029747247696, + "learning_rate": 0.00011595197255574614, + "loss": 0.0603, + "step": 3190 + }, + { + "epoch": 0.8423966778722562, + "grad_norm": 0.37928736209869385, + "learning_rate": 0.0001158200290275762, + "loss": 0.0266, + "step": 3195 + }, + { + "epoch": 0.8437149825324632, + "grad_norm": 0.018329354003071785, + "learning_rate": 0.00011568808549940627, + "loss": 0.0047, + "step": 3200 + }, + { + "epoch": 0.8450332871926702, + "grad_norm": 0.2993735373020172, + "learning_rate": 0.00011555614197123632, + "loss": 0.0218, + "step": 3205 + }, + { + "epoch": 0.8463515918528772, + "grad_norm": 0.1767728328704834, + "learning_rate": 0.00011542419844306638, + "loss": 0.0363, + "step": 3210 + }, + { + "epoch": 0.8476698965130842, + "grad_norm": 0.39774414896965027, + "learning_rate": 0.00011529225491489644, + "loss": 0.0506, + "step": 3215 + }, + { + "epoch": 0.8489882011732911, + "grad_norm": 0.021896762773394585, + "learning_rate": 0.00011516031138672649, + "loss": 0.0081, + "step": 3220 + }, + { + "epoch": 0.8503065058334981, + "grad_norm": 0.358372300863266, + 
"learning_rate": 0.00011502836785855655, + "loss": 0.0224, + "step": 3225 + }, + { + "epoch": 0.8516248104937051, + "grad_norm": 0.01605542004108429, + "learning_rate": 0.00011489642433038662, + "loss": 0.0215, + "step": 3230 + }, + { + "epoch": 0.8529431151539121, + "grad_norm": 0.021189266815781593, + "learning_rate": 0.00011476448080221667, + "loss": 0.0051, + "step": 3235 + }, + { + "epoch": 0.8542614198141191, + "grad_norm": 0.013394076377153397, + "learning_rate": 0.0001146325372740467, + "loss": 0.021, + "step": 3240 + }, + { + "epoch": 0.8555797244743261, + "grad_norm": 0.19848507642745972, + "learning_rate": 0.00011450059374587676, + "loss": 0.0285, + "step": 3245 + }, + { + "epoch": 0.856898029134533, + "grad_norm": 0.2463046759366989, + "learning_rate": 0.00011436865021770683, + "loss": 0.0384, + "step": 3250 + }, + { + "epoch": 0.8582163337947399, + "grad_norm": 0.37432390451431274, + "learning_rate": 0.00011423670668953688, + "loss": 0.0098, + "step": 3255 + }, + { + "epoch": 0.8595346384549469, + "grad_norm": 0.060943394899368286, + "learning_rate": 0.00011410476316136694, + "loss": 0.0087, + "step": 3260 + }, + { + "epoch": 0.8608529431151539, + "grad_norm": 0.2846696674823761, + "learning_rate": 0.00011397281963319699, + "loss": 0.0148, + "step": 3265 + }, + { + "epoch": 0.8621712477753609, + "grad_norm": 0.009311323054134846, + "learning_rate": 0.00011384087610502705, + "loss": 0.0024, + "step": 3270 + }, + { + "epoch": 0.8634895524355679, + "grad_norm": 0.046277035027742386, + "learning_rate": 0.0001137089325768571, + "loss": 0.0274, + "step": 3275 + }, + { + "epoch": 0.8648078570957748, + "grad_norm": 0.006024620030075312, + "learning_rate": 0.00011357698904868716, + "loss": 0.0286, + "step": 3280 + }, + { + "epoch": 0.8661261617559818, + "grad_norm": 0.033578380942344666, + "learning_rate": 0.00011344504552051723, + "loss": 0.0153, + "step": 3285 + }, + { + "epoch": 0.8674444664161888, + "grad_norm": 0.8537917137145996, + "learning_rate": 0.00011331310199234728, + "loss": 0.0304, + "step": 3290 + }, + { + "epoch": 0.8687627710763958, + "grad_norm": 0.013933337293565273, + "learning_rate": 0.00011318115846417734, + "loss": 0.0112, + "step": 3295 + }, + { + "epoch": 0.8700810757366028, + "grad_norm": 0.35437721014022827, + "learning_rate": 0.0001130492149360074, + "loss": 0.0228, + "step": 3300 + }, + { + "epoch": 0.8713993803968098, + "grad_norm": 1.3024121522903442, + "learning_rate": 0.00011291727140783745, + "loss": 0.0203, + "step": 3305 + }, + { + "epoch": 0.8727176850570166, + "grad_norm": 0.5131255984306335, + "learning_rate": 0.00011278532787966751, + "loss": 0.0181, + "step": 3310 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.039366886019706726, + "learning_rate": 0.00011265338435149758, + "loss": 0.0192, + "step": 3315 + }, + { + "epoch": 0.8753542943774306, + "grad_norm": 0.13679669797420502, + "learning_rate": 0.00011252144082332763, + "loss": 0.004, + "step": 3320 + }, + { + "epoch": 0.8766725990376376, + "grad_norm": 0.003076886525377631, + "learning_rate": 0.00011238949729515769, + "loss": 0.0405, + "step": 3325 + }, + { + "epoch": 0.8779909036978446, + "grad_norm": 0.019953785464167595, + "learning_rate": 0.00011225755376698774, + "loss": 0.0241, + "step": 3330 + }, + { + "epoch": 0.8793092083580516, + "grad_norm": 0.007980377413332462, + "learning_rate": 0.0001121256102388178, + "loss": 0.0064, + "step": 3335 + }, + { + "epoch": 0.8806275130182585, + "grad_norm": 0.018761295825242996, + "learning_rate": 0.00011199366671064784, + "loss": 
0.0032, + "step": 3340 + }, + { + "epoch": 0.8819458176784655, + "grad_norm": 0.022511709481477737, + "learning_rate": 0.0001118617231824779, + "loss": 0.0055, + "step": 3345 + }, + { + "epoch": 0.8832641223386725, + "grad_norm": 0.021270718425512314, + "learning_rate": 0.00011172977965430795, + "loss": 0.033, + "step": 3350 + }, + { + "epoch": 0.8845824269988795, + "grad_norm": 0.02710561640560627, + "learning_rate": 0.00011159783612613801, + "loss": 0.0094, + "step": 3355 + }, + { + "epoch": 0.8859007316590864, + "grad_norm": 0.4353378117084503, + "learning_rate": 0.00011146589259796806, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.8872190363192934, + "grad_norm": 0.0257766991853714, + "learning_rate": 0.00011133394906979813, + "loss": 0.0059, + "step": 3365 + }, + { + "epoch": 0.8885373409795003, + "grad_norm": 0.80838942527771, + "learning_rate": 0.00011120200554162819, + "loss": 0.0263, + "step": 3370 + }, + { + "epoch": 0.8898556456397073, + "grad_norm": 0.007799761835485697, + "learning_rate": 0.00011107006201345824, + "loss": 0.0028, + "step": 3375 + }, + { + "epoch": 0.8911739502999143, + "grad_norm": 0.007315775845199823, + "learning_rate": 0.0001109381184852883, + "loss": 0.0127, + "step": 3380 + }, + { + "epoch": 0.8924922549601213, + "grad_norm": 1.4861233234405518, + "learning_rate": 0.00011080617495711836, + "loss": 0.0562, + "step": 3385 + }, + { + "epoch": 0.8938105596203283, + "grad_norm": 0.010219530202448368, + "learning_rate": 0.00011067423142894841, + "loss": 0.0438, + "step": 3390 + }, + { + "epoch": 0.8951288642805353, + "grad_norm": 1.0191857814788818, + "learning_rate": 0.00011054228790077848, + "loss": 0.0493, + "step": 3395 + }, + { + "epoch": 0.8964471689407422, + "grad_norm": 0.01459536887705326, + "learning_rate": 0.00011041034437260854, + "loss": 0.0117, + "step": 3400 + }, + { + "epoch": 0.8977654736009492, + "grad_norm": 0.008682495914399624, + "learning_rate": 0.00011027840084443859, + "loss": 0.02, + "step": 3405 + }, + { + "epoch": 0.8990837782611562, + "grad_norm": 0.02197263017296791, + "learning_rate": 0.00011014645731626865, + "loss": 0.0454, + "step": 3410 + }, + { + "epoch": 0.9004020829213631, + "grad_norm": 0.01436714269220829, + "learning_rate": 0.0001100145137880987, + "loss": 0.0283, + "step": 3415 + }, + { + "epoch": 0.9017203875815701, + "grad_norm": 0.14327946305274963, + "learning_rate": 0.00010988257025992876, + "loss": 0.0461, + "step": 3420 + }, + { + "epoch": 0.9030386922417771, + "grad_norm": 1.671773910522461, + "learning_rate": 0.00010975062673175883, + "loss": 0.054, + "step": 3425 + }, + { + "epoch": 0.904356996901984, + "grad_norm": 0.009926804341375828, + "learning_rate": 0.00010961868320358888, + "loss": 0.0429, + "step": 3430 + }, + { + "epoch": 0.905675301562191, + "grad_norm": 0.554020881652832, + "learning_rate": 0.00010948673967541894, + "loss": 0.0618, + "step": 3435 + }, + { + "epoch": 0.906993606222398, + "grad_norm": 0.1399248093366623, + "learning_rate": 0.00010935479614724897, + "loss": 0.0229, + "step": 3440 + }, + { + "epoch": 0.908311910882605, + "grad_norm": 0.02739197015762329, + "learning_rate": 0.00010922285261907904, + "loss": 0.0082, + "step": 3445 + }, + { + "epoch": 0.909630215542812, + "grad_norm": 0.33394527435302734, + "learning_rate": 0.00010909090909090909, + "loss": 0.0403, + "step": 3450 + }, + { + "epoch": 0.9109485202030189, + "grad_norm": 0.08083894103765488, + "learning_rate": 0.00010895896556273915, + "loss": 0.0406, + "step": 3455 + }, + { + "epoch": 0.9122668248632259, + 
"grad_norm": 0.39336663484573364, + "learning_rate": 0.0001088270220345692, + "loss": 0.02, + "step": 3460 + }, + { + "epoch": 0.9135851295234328, + "grad_norm": 0.20481553673744202, + "learning_rate": 0.00010869507850639926, + "loss": 0.0221, + "step": 3465 + }, + { + "epoch": 0.9149034341836398, + "grad_norm": 1.4507408142089844, + "learning_rate": 0.00010856313497822932, + "loss": 0.0357, + "step": 3470 + }, + { + "epoch": 0.9162217388438468, + "grad_norm": 0.2678806483745575, + "learning_rate": 0.00010843119145005937, + "loss": 0.0181, + "step": 3475 + }, + { + "epoch": 0.9175400435040538, + "grad_norm": 0.007361674215644598, + "learning_rate": 0.00010829924792188944, + "loss": 0.0978, + "step": 3480 + }, + { + "epoch": 0.9188583481642607, + "grad_norm": 0.773695707321167, + "learning_rate": 0.0001081673043937195, + "loss": 0.0401, + "step": 3485 + }, + { + "epoch": 0.9201766528244677, + "grad_norm": 0.0010772625682875514, + "learning_rate": 0.00010803536086554955, + "loss": 0.0233, + "step": 3490 + }, + { + "epoch": 0.9214949574846747, + "grad_norm": 0.08971104770898819, + "learning_rate": 0.00010790341733737961, + "loss": 0.0319, + "step": 3495 + }, + { + "epoch": 0.9228132621448817, + "grad_norm": 0.21372731029987335, + "learning_rate": 0.00010777147380920966, + "loss": 0.0315, + "step": 3500 + }, + { + "epoch": 0.9228132621448817, + "eval_loss": 0.02952708676457405, + "eval_runtime": 451.5837, + "eval_samples_per_second": 7.467, + "eval_steps_per_second": 3.734, + "step": 3500 + }, + { + "epoch": 0.9241315668050887, + "grad_norm": 0.016639264300465584, + "learning_rate": 0.00010763953028103972, + "loss": 0.0125, + "step": 3505 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.46340492367744446, + "learning_rate": 0.00010750758675286979, + "loss": 0.0186, + "step": 3510 + }, + { + "epoch": 0.9267681761255026, + "grad_norm": 0.01847526989877224, + "learning_rate": 0.00010737564322469984, + "loss": 0.0026, + "step": 3515 + }, + { + "epoch": 0.9280864807857095, + "grad_norm": 0.5947860479354858, + "learning_rate": 0.0001072436996965299, + "loss": 0.0259, + "step": 3520 + }, + { + "epoch": 0.9294047854459165, + "grad_norm": 0.06145291402935982, + "learning_rate": 0.00010711175616835995, + "loss": 0.0057, + "step": 3525 + }, + { + "epoch": 0.9307230901061235, + "grad_norm": 0.0143959429115057, + "learning_rate": 0.00010697981264019001, + "loss": 0.0145, + "step": 3530 + }, + { + "epoch": 0.9320413947663305, + "grad_norm": 0.21143831312656403, + "learning_rate": 0.00010684786911202007, + "loss": 0.0459, + "step": 3535 + }, + { + "epoch": 0.9333596994265375, + "grad_norm": 0.02548077143728733, + "learning_rate": 0.00010671592558385011, + "loss": 0.0051, + "step": 3540 + }, + { + "epoch": 0.9346780040867444, + "grad_norm": 0.008077048696577549, + "learning_rate": 0.00010658398205568016, + "loss": 0.0306, + "step": 3545 + }, + { + "epoch": 0.9359963087469514, + "grad_norm": 0.0030760422814637423, + "learning_rate": 0.00010645203852751022, + "loss": 0.0575, + "step": 3550 + }, + { + "epoch": 0.9373146134071584, + "grad_norm": 0.18114158511161804, + "learning_rate": 0.00010632009499934027, + "loss": 0.0885, + "step": 3555 + }, + { + "epoch": 0.9386329180673654, + "grad_norm": 0.02450549602508545, + "learning_rate": 0.00010618815147117034, + "loss": 0.0045, + "step": 3560 + }, + { + "epoch": 0.9399512227275724, + "grad_norm": 0.1238626018166542, + "learning_rate": 0.0001060562079430004, + "loss": 0.0166, + "step": 3565 + }, + { + "epoch": 0.9412695273877794, + "grad_norm": 
0.1879919469356537, + "learning_rate": 0.00010592426441483046, + "loss": 0.0077, + "step": 3570 + }, + { + "epoch": 0.9425878320479862, + "grad_norm": 0.11323565989732742, + "learning_rate": 0.00010579232088666051, + "loss": 0.0213, + "step": 3575 + }, + { + "epoch": 0.9439061367081932, + "grad_norm": 0.35575854778289795, + "learning_rate": 0.00010566037735849057, + "loss": 0.0336, + "step": 3580 + }, + { + "epoch": 0.9452244413684002, + "grad_norm": 0.14052227139472961, + "learning_rate": 0.00010552843383032062, + "loss": 0.0325, + "step": 3585 + }, + { + "epoch": 0.9465427460286072, + "grad_norm": 0.2643798887729645, + "learning_rate": 0.00010539649030215069, + "loss": 0.0192, + "step": 3590 + }, + { + "epoch": 0.9478610506888142, + "grad_norm": 0.3207031190395355, + "learning_rate": 0.00010526454677398075, + "loss": 0.0221, + "step": 3595 + }, + { + "epoch": 0.9491793553490212, + "grad_norm": 0.022803861647844315, + "learning_rate": 0.0001051326032458108, + "loss": 0.029, + "step": 3600 + }, + { + "epoch": 0.9504976600092281, + "grad_norm": 0.02511664852499962, + "learning_rate": 0.00010500065971764086, + "loss": 0.0422, + "step": 3605 + }, + { + "epoch": 0.9518159646694351, + "grad_norm": 0.06505445390939713, + "learning_rate": 0.00010486871618947091, + "loss": 0.0092, + "step": 3610 + }, + { + "epoch": 0.9531342693296421, + "grad_norm": 0.09998584538698196, + "learning_rate": 0.00010473677266130097, + "loss": 0.0242, + "step": 3615 + }, + { + "epoch": 0.9544525739898491, + "grad_norm": 0.9645698666572571, + "learning_rate": 0.00010460482913313104, + "loss": 0.0124, + "step": 3620 + }, + { + "epoch": 0.955770878650056, + "grad_norm": 0.2389964610338211, + "learning_rate": 0.0001044728856049611, + "loss": 0.0169, + "step": 3625 + }, + { + "epoch": 0.957089183310263, + "grad_norm": 2.030608654022217, + "learning_rate": 0.00010434094207679115, + "loss": 0.0518, + "step": 3630 + }, + { + "epoch": 0.9584074879704699, + "grad_norm": 0.05979987606406212, + "learning_rate": 0.0001042089985486212, + "loss": 0.0081, + "step": 3635 + }, + { + "epoch": 0.9597257926306769, + "grad_norm": 0.15761719644069672, + "learning_rate": 0.00010407705502045125, + "loss": 0.0061, + "step": 3640 + }, + { + "epoch": 0.9610440972908839, + "grad_norm": 0.6534290909767151, + "learning_rate": 0.0001039451114922813, + "loss": 0.0104, + "step": 3645 + }, + { + "epoch": 0.9623624019510909, + "grad_norm": 1.0324147939682007, + "learning_rate": 0.00010381316796411136, + "loss": 0.0381, + "step": 3650 + }, + { + "epoch": 0.9636807066112979, + "grad_norm": 0.002968872431665659, + "learning_rate": 0.00010368122443594142, + "loss": 0.0343, + "step": 3655 + }, + { + "epoch": 0.9649990112715049, + "grad_norm": 0.011243184097111225, + "learning_rate": 0.00010354928090777147, + "loss": 0.019, + "step": 3660 + }, + { + "epoch": 0.9663173159317118, + "grad_norm": 0.17663739621639252, + "learning_rate": 0.00010341733737960153, + "loss": 0.0452, + "step": 3665 + }, + { + "epoch": 0.9676356205919188, + "grad_norm": 1.2647719383239746, + "learning_rate": 0.00010328539385143158, + "loss": 0.0154, + "step": 3670 + }, + { + "epoch": 0.9689539252521258, + "grad_norm": 0.3691752552986145, + "learning_rate": 0.00010315345032326165, + "loss": 0.028, + "step": 3675 + }, + { + "epoch": 0.9702722299123328, + "grad_norm": 0.0015879774000495672, + "learning_rate": 0.00010302150679509171, + "loss": 0.0202, + "step": 3680 + }, + { + "epoch": 0.9715905345725397, + "grad_norm": 0.1441984623670578, + "learning_rate": 0.00010288956326692176, + "loss": 
0.0221, + "step": 3685 + }, + { + "epoch": 0.9729088392327467, + "grad_norm": 0.20431455969810486, + "learning_rate": 0.00010275761973875182, + "loss": 0.0072, + "step": 3690 + }, + { + "epoch": 0.9742271438929536, + "grad_norm": 0.861625611782074, + "learning_rate": 0.00010262567621058187, + "loss": 0.0523, + "step": 3695 + }, + { + "epoch": 0.9755454485531606, + "grad_norm": 0.005049478262662888, + "learning_rate": 0.00010249373268241193, + "loss": 0.0051, + "step": 3700 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.49685510993003845, + "learning_rate": 0.000102361789154242, + "loss": 0.023, + "step": 3705 + }, + { + "epoch": 0.9781820578735746, + "grad_norm": 0.08789395540952682, + "learning_rate": 0.00010222984562607205, + "loss": 0.0159, + "step": 3710 + }, + { + "epoch": 0.9795003625337816, + "grad_norm": 0.027168691158294678, + "learning_rate": 0.00010209790209790211, + "loss": 0.0083, + "step": 3715 + }, + { + "epoch": 0.9808186671939886, + "grad_norm": 0.0006773864733986557, + "learning_rate": 0.00010196595856973217, + "loss": 0.0048, + "step": 3720 + }, + { + "epoch": 0.9821369718541955, + "grad_norm": 0.01636457070708275, + "learning_rate": 0.00010183401504156222, + "loss": 0.0159, + "step": 3725 + }, + { + "epoch": 0.9834552765144025, + "grad_norm": 0.10160859674215317, + "learning_rate": 0.00010170207151339228, + "loss": 0.0047, + "step": 3730 + }, + { + "epoch": 0.9847735811746094, + "grad_norm": 0.14173269271850586, + "learning_rate": 0.00010157012798522232, + "loss": 0.006, + "step": 3735 + }, + { + "epoch": 0.9860918858348164, + "grad_norm": 0.003458512481302023, + "learning_rate": 0.00010143818445705238, + "loss": 0.0193, + "step": 3740 + }, + { + "epoch": 0.9874101904950234, + "grad_norm": 0.005163820460438728, + "learning_rate": 0.00010130624092888243, + "loss": 0.0039, + "step": 3745 + }, + { + "epoch": 0.9887284951552304, + "grad_norm": 0.005913791712373495, + "learning_rate": 0.00010117429740071249, + "loss": 0.0119, + "step": 3750 + }, + { + "epoch": 0.9900467998154373, + "grad_norm": 0.00800853967666626, + "learning_rate": 0.00010104235387254256, + "loss": 0.044, + "step": 3755 + }, + { + "epoch": 0.9913651044756443, + "grad_norm": 0.18146778643131256, + "learning_rate": 0.00010091041034437261, + "loss": 0.0048, + "step": 3760 + }, + { + "epoch": 0.9926834091358513, + "grad_norm": 0.01235104724764824, + "learning_rate": 0.00010077846681620267, + "loss": 0.0017, + "step": 3765 + }, + { + "epoch": 0.9940017137960583, + "grad_norm": 0.17677897214889526, + "learning_rate": 0.00010064652328803272, + "loss": 0.0339, + "step": 3770 + }, + { + "epoch": 0.9953200184562653, + "grad_norm": 0.0017472271574661136, + "learning_rate": 0.00010051457975986278, + "loss": 0.0494, + "step": 3775 + }, + { + "epoch": 0.9966383231164723, + "grad_norm": 0.10814860463142395, + "learning_rate": 0.00010038263623169283, + "loss": 0.0741, + "step": 3780 + }, + { + "epoch": 0.9979566277766792, + "grad_norm": 0.11329760402441025, + "learning_rate": 0.0001002506927035229, + "loss": 0.0182, + "step": 3785 + }, + { + "epoch": 0.9992749324368861, + "grad_norm": 0.11573276668787003, + "learning_rate": 0.00010011874917535296, + "loss": 0.0068, + "step": 3790 + }, + { + "epoch": 1.000790982796124, + "grad_norm": 0.08449886739253998, + "learning_rate": 9.998680564718301e-05, + "loss": 0.0141, + "step": 3795 + }, + { + "epoch": 1.002109287456331, + "grad_norm": 0.05035184696316719, + "learning_rate": 9.985486211901307e-05, + "loss": 0.0293, + "step": 3800 + }, + { + "epoch": 1.003427592116538, 
+ "grad_norm": 0.0255444198846817, + "learning_rate": 9.972291859084313e-05, + "loss": 0.0054, + "step": 3805 + }, + { + "epoch": 1.004745896776745, + "grad_norm": 0.0033677336759865284, + "learning_rate": 9.959097506267318e-05, + "loss": 0.0567, + "step": 3810 + }, + { + "epoch": 1.006064201436952, + "grad_norm": 0.09453682601451874, + "learning_rate": 9.945903153450324e-05, + "loss": 0.0589, + "step": 3815 + }, + { + "epoch": 1.007382506097159, + "grad_norm": 0.01592979207634926, + "learning_rate": 9.932708800633329e-05, + "loss": 0.0043, + "step": 3820 + }, + { + "epoch": 1.008700810757366, + "grad_norm": 0.002263693604618311, + "learning_rate": 9.919514447816335e-05, + "loss": 0.0195, + "step": 3825 + }, + { + "epoch": 1.010019115417573, + "grad_norm": 0.013390793465077877, + "learning_rate": 9.90632009499934e-05, + "loss": 0.0152, + "step": 3830 + }, + { + "epoch": 1.01133742007778, + "grad_norm": 0.10473847389221191, + "learning_rate": 9.893125742182346e-05, + "loss": 0.0606, + "step": 3835 + }, + { + "epoch": 1.012655724737987, + "grad_norm": 0.05837221071124077, + "learning_rate": 9.879931389365353e-05, + "loss": 0.0121, + "step": 3840 + }, + { + "epoch": 1.013974029398194, + "grad_norm": 0.3803791105747223, + "learning_rate": 9.866737036548358e-05, + "loss": 0.0386, + "step": 3845 + }, + { + "epoch": 1.0152923340584008, + "grad_norm": 0.4067519009113312, + "learning_rate": 9.853542683731364e-05, + "loss": 0.0115, + "step": 3850 + }, + { + "epoch": 1.0166106387186078, + "grad_norm": 0.02585229091346264, + "learning_rate": 9.84034833091437e-05, + "loss": 0.0214, + "step": 3855 + }, + { + "epoch": 1.0179289433788148, + "grad_norm": 0.03670825809240341, + "learning_rate": 9.827153978097374e-05, + "loss": 0.0059, + "step": 3860 + }, + { + "epoch": 1.0192472480390218, + "grad_norm": 0.014171554706990719, + "learning_rate": 9.81395962528038e-05, + "loss": 0.0145, + "step": 3865 + }, + { + "epoch": 1.0205655526992288, + "grad_norm": 0.027376385405659676, + "learning_rate": 9.800765272463386e-05, + "loss": 0.0089, + "step": 3870 + }, + { + "epoch": 1.0218838573594358, + "grad_norm": 0.03168405964970589, + "learning_rate": 9.787570919646392e-05, + "loss": 0.0132, + "step": 3875 + }, + { + "epoch": 1.0232021620196428, + "grad_norm": 0.03346199914813042, + "learning_rate": 9.774376566829397e-05, + "loss": 0.0246, + "step": 3880 + }, + { + "epoch": 1.0245204666798498, + "grad_norm": 0.00894144270569086, + "learning_rate": 9.761182214012403e-05, + "loss": 0.0105, + "step": 3885 + }, + { + "epoch": 1.0258387713400567, + "grad_norm": 0.3172806203365326, + "learning_rate": 9.747987861195409e-05, + "loss": 0.0103, + "step": 3890 + }, + { + "epoch": 1.0271570760002637, + "grad_norm": 0.009055040776729584, + "learning_rate": 9.734793508378414e-05, + "loss": 0.0103, + "step": 3895 + }, + { + "epoch": 1.0284753806604707, + "grad_norm": 0.014140011742711067, + "learning_rate": 9.721599155561421e-05, + "loss": 0.0037, + "step": 3900 + }, + { + "epoch": 1.0297936853206777, + "grad_norm": 0.008317383006215096, + "learning_rate": 9.708404802744427e-05, + "loss": 0.002, + "step": 3905 + }, + { + "epoch": 1.0311119899808845, + "grad_norm": 0.005038558971136808, + "learning_rate": 9.695210449927431e-05, + "loss": 0.0017, + "step": 3910 + }, + { + "epoch": 1.0324302946410915, + "grad_norm": 0.40058520436286926, + "learning_rate": 9.682016097110436e-05, + "loss": 0.0065, + "step": 3915 + }, + { + "epoch": 1.0337485993012985, + "grad_norm": 0.005197151098400354, + "learning_rate": 9.668821744293442e-05, + "loss": 
0.0031, + "step": 3920 + }, + { + "epoch": 1.0350669039615055, + "grad_norm": 0.014353781007230282, + "learning_rate": 9.655627391476449e-05, + "loss": 0.0009, + "step": 3925 + }, + { + "epoch": 1.0363852086217125, + "grad_norm": 0.13260559737682343, + "learning_rate": 9.642433038659454e-05, + "loss": 0.0323, + "step": 3930 + }, + { + "epoch": 1.0377035132819195, + "grad_norm": 0.006795065477490425, + "learning_rate": 9.62923868584246e-05, + "loss": 0.0022, + "step": 3935 + }, + { + "epoch": 1.0390218179421264, + "grad_norm": 0.2276086062192917, + "learning_rate": 9.616044333025466e-05, + "loss": 0.0221, + "step": 3940 + }, + { + "epoch": 1.0403401226023334, + "grad_norm": 0.06121920794248581, + "learning_rate": 9.602849980208471e-05, + "loss": 0.0037, + "step": 3945 + }, + { + "epoch": 1.0416584272625404, + "grad_norm": 0.9180755019187927, + "learning_rate": 9.589655627391477e-05, + "loss": 0.0589, + "step": 3950 + }, + { + "epoch": 1.0429767319227474, + "grad_norm": 0.07515591382980347, + "learning_rate": 9.576461274574484e-05, + "loss": 0.0653, + "step": 3955 + }, + { + "epoch": 1.0442950365829544, + "grad_norm": 0.018060607835650444, + "learning_rate": 9.563266921757488e-05, + "loss": 0.0178, + "step": 3960 + }, + { + "epoch": 1.0456133412431612, + "grad_norm": 0.02751368284225464, + "learning_rate": 9.550072568940493e-05, + "loss": 0.0076, + "step": 3965 + }, + { + "epoch": 1.0469316459033682, + "grad_norm": 0.653998613357544, + "learning_rate": 9.536878216123499e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 1.0482499505635752, + "grad_norm": 0.3117768168449402, + "learning_rate": 9.523683863306505e-05, + "loss": 0.0087, + "step": 3975 + }, + { + "epoch": 1.0495682552237822, + "grad_norm": 0.013952831737697124, + "learning_rate": 9.510489510489511e-05, + "loss": 0.0037, + "step": 3980 + }, + { + "epoch": 1.0508865598839892, + "grad_norm": 0.01806250400841236, + "learning_rate": 9.497295157672517e-05, + "loss": 0.0028, + "step": 3985 + }, + { + "epoch": 1.0522048645441962, + "grad_norm": 0.13678006827831268, + "learning_rate": 9.484100804855523e-05, + "loss": 0.0533, + "step": 3990 + }, + { + "epoch": 1.0535231692044031, + "grad_norm": 0.14869382977485657, + "learning_rate": 9.470906452038528e-05, + "loss": 0.009, + "step": 3995 + }, + { + "epoch": 1.0548414738646101, + "grad_norm": 0.33614659309387207, + "learning_rate": 9.457712099221534e-05, + "loss": 0.0555, + "step": 4000 + }, + { + "epoch": 1.0548414738646101, + "eval_loss": 0.026165226474404335, + "eval_runtime": 452.2482, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 4000 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.2285640340572365e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-4500/README.md 
b/checkpoint-4500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-4500/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-4500/adapter_config.json b/checkpoint-4500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-4500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4500/adapter_model.safetensors b/checkpoint-4500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3e1cd35b522d05c88a455cc816c6163ebfd4d214 --- /dev/null +++ b/checkpoint-4500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b818140e2ca56a90e334bcaf271a3b4830d0c22b6df6664bff661541ea3a9050 +size 97307544 diff --git a/checkpoint-4500/optimizer.pt b/checkpoint-4500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..50dd0088027da20086b4235cab07696a6ce8a0fe --- /dev/null +++ b/checkpoint-4500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ed88048c7583f348e0f73f28fee1f9e103a96a1a5f1bcbc93ee6393ecb76487 +size 50866370 diff --git a/checkpoint-4500/rng_state.pth b/checkpoint-4500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ebebe3a8047a8c804e8036186017e8bb6aca806b --- /dev/null +++ b/checkpoint-4500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004e07e622390b3ef93cd3c314e8250117d865d4c97d71719da5ef24b150050b +size 14244 diff --git a/checkpoint-4500/scheduler.pt b/checkpoint-4500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..33404f9114cb2dad5eb3d2d4143ddbbcafcad153 --- /dev/null +++ b/checkpoint-4500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:649a2d48b1a864be3de65c41d9e7ac16af9949b7390956cd6ba584bbafb2e92c +size 1064 diff --git a/checkpoint-4500/special_tokens_map.json b/checkpoint-4500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-4500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-4500/tokenizer.json b/checkpoint-4500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-4500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-4500/tokenizer_config.json b/checkpoint-4500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-4500/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": 
"<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": 
"<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": 
"<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": 
"<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": 
"<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": 
"<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": 
"<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": 
"<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": 
"<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-4500/trainer_state.json b/checkpoint-4500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1701e54b3580f690ce09637eac7be861f6728f83 --- /dev/null +++ b/checkpoint-4500/trainer_state.json @@ -0,0 +1,6408 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 
1.1866719398853074, + "eval_steps": 500, + "global_step": 4500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + "step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 
0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + 
"learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + "learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + 
"learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, 
+ "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + "learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + 
"learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + 
"grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + 
"step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 
0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + "loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + 
"learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + 
"epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 
0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + 
"epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 
0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + 
"loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + }, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 
0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + 
"learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 
0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + "loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 
0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + "grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + 
"learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + 
"step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, + { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 
0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.03460961952805519, + "learning_rate": 0.00013402823591502838, + "loss": 0.0097, + "step": 2505 + }, + { + "epoch": 0.6617889394239008, + "grad_norm": 0.31785696744918823, + "learning_rate": 0.00013389629238685842, + "loss": 0.0303, + "step": 2510 + }, + { + "epoch": 0.6631072440841078, + "grad_norm": 0.4273851215839386, + "learning_rate": 0.0001337643488586885, + "loss": 0.0499, + "step": 2515 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.02236153744161129, + "learning_rate": 0.00013363240533051856, + "loss": 0.0069, + "step": 2520 + }, + { + "epoch": 0.6657438534045218, + "grad_norm": 0.1592864990234375, + "learning_rate": 0.0001335004618023486, + "loss": 0.0326, + "step": 2525 + }, + { + "epoch": 0.6670621580647288, + "grad_norm": 0.029961545020341873, + "learning_rate": 0.00013336851827417867, + "loss": 0.0178, + "step": 2530 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.03120764158666134, + "learning_rate": 0.00013323657474600872, + "loss": 0.115, + "step": 2535 + }, + { + "epoch": 0.6696987673851427, + "grad_norm": 0.01060028001666069, + "learning_rate": 0.00013310463121783879, + "loss": 0.0036, + "step": 2540 + }, + { + "epoch": 0.6710170720453497, + "grad_norm": 0.053470809012651443, + "learning_rate": 0.00013297268768966883, + "loss": 0.0079, + "step": 2545 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.022777097299695015, + "learning_rate": 0.00013284074416149887, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6736536813657636, + "grad_norm": 0.0548521913588047, + "learning_rate": 0.00013270880063332894, + "loss": 0.0503, + "step": 2555 + }, + { + "epoch": 0.6749719860259706, + "grad_norm": 0.02028457075357437, + "learning_rate": 0.00013257685710515898, + "loss": 0.0096, + "step": 2560 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.01569107361137867, + "learning_rate": 0.00013244491357698905, + "loss": 0.008, + "step": 2565 + }, + { + "epoch": 0.6776085953463845, + "grad_norm": 0.00743742985650897, + "learning_rate": 0.00013231297004881912, + "loss": 0.005, + "step": 2570 + }, + { + "epoch": 0.6789269000065915, + "grad_norm": 0.025164416059851646, + "learning_rate": 0.00013218102652064916, + "loss": 0.018, + "step": 2575 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.3653188645839691, + "learning_rate": 0.00013204908299247923, + "loss": 0.0295, + "step": 2580 + }, + { + "epoch": 0.6815635093270055, + "grad_norm": 0.685422956943512, + "learning_rate": 0.00013191713946430927, + "loss": 0.0335, + "step": 2585 + }, + { + "epoch": 0.6828818139872125, + "grad_norm": 0.675740122795105, + "learning_rate": 0.00013178519593613934, + "loss": 0.0592, + "step": 2590 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.10513252764940262, + "learning_rate": 0.00013165325240796938, + "loss": 0.0353, + "step": 2595 + }, + { + "epoch": 0.6855184233076264, + "grad_norm": 0.43512973189353943, + "learning_rate": 0.00013152130887979945, + "loss": 0.0142, + "step": 2600 + }, + { + "epoch": 0.6868367279678333, + "grad_norm": 0.029436839744448662, + "learning_rate": 0.00013138936535162952, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.5607122778892517, + "learning_rate": 0.00013125742182345957, + "loss": 0.0184, + "step": 2610 + }, + { + "epoch": 0.6894733372882473, + "grad_norm": 0.11365406215190887, + 
"learning_rate": 0.00013112547829528963, + "loss": 0.006, + "step": 2615 + }, + { + "epoch": 0.6907916419484543, + "grad_norm": 0.047227244824171066, + "learning_rate": 0.00013099353476711968, + "loss": 0.008, + "step": 2620 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.0005877618095837533, + "learning_rate": 0.00013086159123894975, + "loss": 0.0286, + "step": 2625 + }, + { + "epoch": 0.6934282512688682, + "grad_norm": 0.010759112425148487, + "learning_rate": 0.0001307296477107798, + "loss": 0.0062, + "step": 2630 + }, + { + "epoch": 0.6947465559290752, + "grad_norm": 0.07117745280265808, + "learning_rate": 0.00013059770418260986, + "loss": 0.0891, + "step": 2635 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.0639057606458664, + "learning_rate": 0.00013046576065443993, + "loss": 0.0072, + "step": 2640 + }, + { + "epoch": 0.6973831652494892, + "grad_norm": 0.027350090444087982, + "learning_rate": 0.00013033381712626994, + "loss": 0.0103, + "step": 2645 + }, + { + "epoch": 0.6987014699096962, + "grad_norm": 0.015336195938289165, + "learning_rate": 0.0001302018735981, + "loss": 0.0041, + "step": 2650 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 1.0650830268859863, + "learning_rate": 0.00013006993006993008, + "loss": 0.0443, + "step": 2655 + }, + { + "epoch": 0.70133807923011, + "grad_norm": 0.019073212519288063, + "learning_rate": 0.00012993798654176012, + "loss": 0.0331, + "step": 2660 + }, + { + "epoch": 0.702656383890317, + "grad_norm": 0.10109209269285202, + "learning_rate": 0.0001298060430135902, + "loss": 0.0054, + "step": 2665 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.03528957813978195, + "learning_rate": 0.00012967409948542023, + "loss": 0.0427, + "step": 2670 + }, + { + "epoch": 0.705292993210731, + "grad_norm": 0.03577788919210434, + "learning_rate": 0.0001295421559572503, + "loss": 0.023, + "step": 2675 + }, + { + "epoch": 0.706611297870938, + "grad_norm": 0.5576180815696716, + "learning_rate": 0.00012941021242908034, + "loss": 0.0416, + "step": 2680 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.017131298780441284, + "learning_rate": 0.0001292782689009104, + "loss": 0.0235, + "step": 2685 + }, + { + "epoch": 0.7092479071913519, + "grad_norm": 0.8517888784408569, + "learning_rate": 0.00012914632537274048, + "loss": 0.0168, + "step": 2690 + }, + { + "epoch": 0.7105662118515589, + "grad_norm": 0.23812156915664673, + "learning_rate": 0.00012901438184457052, + "loss": 0.0483, + "step": 2695 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.11746613681316376, + "learning_rate": 0.0001288824383164006, + "loss": 0.0255, + "step": 2700 + }, + { + "epoch": 0.7132028211719729, + "grad_norm": 0.20089928805828094, + "learning_rate": 0.00012875049478823064, + "loss": 0.0267, + "step": 2705 + }, + { + "epoch": 0.7145211258321799, + "grad_norm": 0.8301129937171936, + "learning_rate": 0.0001286185512600607, + "loss": 0.016, + "step": 2710 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.01838674768805504, + "learning_rate": 0.00012848660773189077, + "loss": 0.0229, + "step": 2715 + }, + { + "epoch": 0.7171577351525937, + "grad_norm": 0.03670337051153183, + "learning_rate": 0.00012835466420372082, + "loss": 0.038, + "step": 2720 + }, + { + "epoch": 0.7184760398128007, + "grad_norm": 0.0452633760869503, + "learning_rate": 0.00012822272067555089, + "loss": 0.0622, + "step": 2725 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.09503110498189926, + "learning_rate": 0.00012809077714738093, + "loss": 0.0209, + "step": 2730 + }, 
+ { + "epoch": 0.7211126491332147, + "grad_norm": 1.0327308177947998, + "learning_rate": 0.000127958833619211, + "loss": 0.0361, + "step": 2735 + }, + { + "epoch": 0.7224309537934217, + "grad_norm": 1.0049290657043457, + "learning_rate": 0.00012782689009104104, + "loss": 0.0365, + "step": 2740 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.029774073511362076, + "learning_rate": 0.00012769494656287108, + "loss": 0.0257, + "step": 2745 + }, + { + "epoch": 0.7250675631138356, + "grad_norm": 0.20974040031433105, + "learning_rate": 0.00012756300303470115, + "loss": 0.0542, + "step": 2750 + }, + { + "epoch": 0.7263858677740426, + "grad_norm": 0.8153854608535767, + "learning_rate": 0.0001274310595065312, + "loss": 0.0216, + "step": 2755 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.4393698573112488, + "learning_rate": 0.00012729911597836126, + "loss": 0.0451, + "step": 2760 + }, + { + "epoch": 0.7290224770944566, + "grad_norm": 0.06990349292755127, + "learning_rate": 0.00012716717245019133, + "loss": 0.03, + "step": 2765 + }, + { + "epoch": 0.7303407817546635, + "grad_norm": 0.32689470052719116, + "learning_rate": 0.00012703522892202137, + "loss": 0.0263, + "step": 2770 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.026600876823067665, + "learning_rate": 0.00012690328539385144, + "loss": 0.0404, + "step": 2775 + }, + { + "epoch": 0.7329773910750774, + "grad_norm": 0.11228257417678833, + "learning_rate": 0.00012677134186568148, + "loss": 0.0224, + "step": 2780 + }, + { + "epoch": 0.7342956957352844, + "grad_norm": 0.6469443440437317, + "learning_rate": 0.00012663939833751155, + "loss": 0.0178, + "step": 2785 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.020773250609636307, + "learning_rate": 0.0001265074548093416, + "loss": 0.011, + "step": 2790 + }, + { + "epoch": 0.7369323050556984, + "grad_norm": 0.7378728985786438, + "learning_rate": 0.00012637551128117167, + "loss": 0.0227, + "step": 2795 + }, + { + "epoch": 0.7382506097159054, + "grad_norm": 0.008189595304429531, + "learning_rate": 0.00012624356775300173, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.031633853912353516, + "learning_rate": 0.00012611162422483178, + "loss": 0.0093, + "step": 2805 + }, + { + "epoch": 0.7408872190363193, + "grad_norm": 0.5078475475311279, + "learning_rate": 0.00012597968069666185, + "loss": 0.0567, + "step": 2810 + }, + { + "epoch": 0.7422055236965263, + "grad_norm": 0.21766887605190277, + "learning_rate": 0.0001258477371684919, + "loss": 0.0485, + "step": 2815 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.3029612898826599, + "learning_rate": 0.00012571579364032196, + "loss": 0.032, + "step": 2820 + }, + { + "epoch": 0.7448421330169402, + "grad_norm": 1.2135159969329834, + "learning_rate": 0.00012558385011215203, + "loss": 0.0139, + "step": 2825 + }, + { + "epoch": 0.7461604376771472, + "grad_norm": 0.016875172033905983, + "learning_rate": 0.00012545190658398207, + "loss": 0.0323, + "step": 2830 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.08923230320215225, + "learning_rate": 0.00012531996305581214, + "loss": 0.0343, + "step": 2835 + }, + { + "epoch": 0.7487970469975611, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.00012518801952764215, + "loss": 0.0431, + "step": 2840 + }, + { + "epoch": 0.7501153516577681, + "grad_norm": 0.7344386577606201, + "learning_rate": 0.00012505607599947222, + "loss": 0.0389, + "step": 2845 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.03681635856628418, + 
"learning_rate": 0.0001249241324713023, + "loss": 0.0258, + "step": 2850 + }, + { + "epoch": 0.7527519609781821, + "grad_norm": 0.22866861522197723, + "learning_rate": 0.00012479218894313233, + "loss": 0.0223, + "step": 2855 + }, + { + "epoch": 0.7540702656383891, + "grad_norm": 0.029770435765385628, + "learning_rate": 0.0001246602454149624, + "loss": 0.0205, + "step": 2860 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.011845707893371582, + "learning_rate": 0.00012452830188679244, + "loss": 0.0252, + "step": 2865 + }, + { + "epoch": 0.756706874958803, + "grad_norm": 0.06696149706840515, + "learning_rate": 0.00012439635835862251, + "loss": 0.0166, + "step": 2870 + }, + { + "epoch": 0.75802517961901, + "grad_norm": 0.01653144136071205, + "learning_rate": 0.00012426441483045256, + "loss": 0.0487, + "step": 2875 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.031312476843595505, + "learning_rate": 0.00012413247130228263, + "loss": 0.0155, + "step": 2880 + }, + { + "epoch": 0.7606617889394239, + "grad_norm": 0.011625733226537704, + "learning_rate": 0.0001240005277741127, + "loss": 0.0333, + "step": 2885 + }, + { + "epoch": 0.7619800935996309, + "grad_norm": 0.012089414522051811, + "learning_rate": 0.00012386858424594274, + "loss": 0.003, + "step": 2890 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.3012307584285736, + "learning_rate": 0.0001237366407177728, + "loss": 0.0172, + "step": 2895 + }, + { + "epoch": 0.7646167029200448, + "grad_norm": 0.31575000286102295, + "learning_rate": 0.00012360469718960285, + "loss": 0.0409, + "step": 2900 + }, + { + "epoch": 0.7659350075802518, + "grad_norm": 0.009794364683330059, + "learning_rate": 0.00012347275366143292, + "loss": 0.0214, + "step": 2905 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5973085165023804, + "learning_rate": 0.00012334081013326299, + "loss": 0.0245, + "step": 2910 + }, + { + "epoch": 0.7685716169006658, + "grad_norm": 0.019750040024518967, + "learning_rate": 0.00012320886660509303, + "loss": 0.0063, + "step": 2915 + }, + { + "epoch": 0.7698899215608728, + "grad_norm": 0.06402858346700668, + "learning_rate": 0.0001230769230769231, + "loss": 0.0444, + "step": 2920 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.02876671403646469, + "learning_rate": 0.00012294497954875314, + "loss": 0.0103, + "step": 2925 + }, + { + "epoch": 0.7725265308812866, + "grad_norm": 0.6962207555770874, + "learning_rate": 0.0001228130360205832, + "loss": 0.0318, + "step": 2930 + }, + { + "epoch": 0.7738448355414936, + "grad_norm": 0.006536522414535284, + "learning_rate": 0.00012268109249241325, + "loss": 0.0096, + "step": 2935 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.07097168266773224, + "learning_rate": 0.0001225491489642433, + "loss": 0.0174, + "step": 2940 + }, + { + "epoch": 0.7764814448619076, + "grad_norm": 0.042360126972198486, + "learning_rate": 0.00012241720543607336, + "loss": 0.0158, + "step": 2945 + }, + { + "epoch": 0.7777997495221146, + "grad_norm": 0.01159572321921587, + "learning_rate": 0.0001222852619079034, + "loss": 0.0265, + "step": 2950 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.38408163189888, + "learning_rate": 0.00012215331837973347, + "loss": 0.0233, + "step": 2955 + }, + { + "epoch": 0.7804363588425285, + "grad_norm": 0.15588605403900146, + "learning_rate": 0.00012202137485156353, + "loss": 0.0041, + "step": 2960 + }, + { + "epoch": 0.7817546635027355, + "grad_norm": 0.006892362609505653, + "learning_rate": 0.00012188943132339358, + "loss": 0.0026, + 
"step": 2965 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.030915727838873863, + "learning_rate": 0.00012175748779522364, + "loss": 0.0028, + "step": 2970 + }, + { + "epoch": 0.7843912728231495, + "grad_norm": 0.8151025772094727, + "learning_rate": 0.00012162554426705371, + "loss": 0.0429, + "step": 2975 + }, + { + "epoch": 0.7857095774833565, + "grad_norm": 0.6765475273132324, + "learning_rate": 0.00012149360073888377, + "loss": 0.0319, + "step": 2980 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.054469238966703415, + "learning_rate": 0.00012136165721071382, + "loss": 0.0413, + "step": 2985 + }, + { + "epoch": 0.7883461868037703, + "grad_norm": 0.045610666275024414, + "learning_rate": 0.00012122971368254388, + "loss": 0.0521, + "step": 2990 + }, + { + "epoch": 0.7896644914639773, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.00012109777015437393, + "loss": 0.0846, + "step": 2995 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.0272397268563509, + "learning_rate": 0.00012096582662620399, + "loss": 0.0364, + "step": 3000 + }, + { + "epoch": 0.7909827961241843, + "eval_loss": 0.033312585204839706, + "eval_runtime": 452.2552, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 3000 + }, + { + "epoch": 0.7923011007843913, + "grad_norm": 0.08674059063196182, + "learning_rate": 0.00012083388309803406, + "loss": 0.0081, + "step": 3005 + }, + { + "epoch": 0.7936194054445982, + "grad_norm": 0.21960832178592682, + "learning_rate": 0.00012070193956986411, + "loss": 0.0468, + "step": 3010 + }, + { + "epoch": 0.7949377101048052, + "grad_norm": 0.11259289085865021, + "learning_rate": 0.00012056999604169417, + "loss": 0.0124, + "step": 3015 + }, + { + "epoch": 0.7962560147650122, + "grad_norm": 0.02945362776517868, + "learning_rate": 0.00012043805251352422, + "loss": 0.0298, + "step": 3020 + }, + { + "epoch": 0.7975743194252192, + "grad_norm": 0.27889615297317505, + "learning_rate": 0.00012030610898535428, + "loss": 0.0251, + "step": 3025 + }, + { + "epoch": 0.7988926240854262, + "grad_norm": 0.05873241275548935, + "learning_rate": 0.00012017416545718434, + "loss": 0.0132, + "step": 3030 + }, + { + "epoch": 0.8002109287456332, + "grad_norm": 0.1570046991109848, + "learning_rate": 0.00012004222192901439, + "loss": 0.0228, + "step": 3035 + }, + { + "epoch": 0.80152923340584, + "grad_norm": 0.12575332820415497, + "learning_rate": 0.00011991027840084443, + "loss": 0.0049, + "step": 3040 + }, + { + "epoch": 0.802847538066047, + "grad_norm": 0.8416435122489929, + "learning_rate": 0.00011977833487267449, + "loss": 0.0542, + "step": 3045 + }, + { + "epoch": 0.804165842726254, + "grad_norm": 0.2605098485946655, + "learning_rate": 0.00011964639134450454, + "loss": 0.0084, + "step": 3050 + }, + { + "epoch": 0.805484147386461, + "grad_norm": 0.8996294736862183, + "learning_rate": 0.00011951444781633461, + "loss": 0.0442, + "step": 3055 + }, + { + "epoch": 0.806802452046668, + "grad_norm": 2.7525105476379395, + "learning_rate": 0.00011938250428816467, + "loss": 0.0642, + "step": 3060 + }, + { + "epoch": 0.808120756706875, + "grad_norm": 0.14955930411815643, + "learning_rate": 0.00011925056075999473, + "loss": 0.0384, + "step": 3065 + }, + { + "epoch": 0.8094390613670819, + "grad_norm": 0.018756115809082985, + "learning_rate": 0.00011911861723182478, + "loss": 0.0154, + "step": 3070 + }, + { + "epoch": 0.8107573660272889, + "grad_norm": 0.23998615145683289, + "learning_rate": 0.00011898667370365484, + "loss": 0.0413, + "step": 3075 + }, + { + "epoch": 
0.8120756706874959, + "grad_norm": 0.27253249287605286, + "learning_rate": 0.00011885473017548489, + "loss": 0.0081, + "step": 3080 + }, + { + "epoch": 0.8133939753477029, + "grad_norm": 0.2925993502140045, + "learning_rate": 0.00011872278664731495, + "loss": 0.0332, + "step": 3085 + }, + { + "epoch": 0.8147122800079099, + "grad_norm": 0.5364832878112793, + "learning_rate": 0.00011859084311914502, + "loss": 0.0143, + "step": 3090 + }, + { + "epoch": 0.8160305846681168, + "grad_norm": 0.32104921340942383, + "learning_rate": 0.00011845889959097507, + "loss": 0.0216, + "step": 3095 + }, + { + "epoch": 0.8173488893283237, + "grad_norm": 0.0205856766551733, + "learning_rate": 0.00011832695606280513, + "loss": 0.0346, + "step": 3100 + }, + { + "epoch": 0.8186671939885307, + "grad_norm": 0.2541547417640686, + "learning_rate": 0.00011819501253463518, + "loss": 0.0793, + "step": 3105 + }, + { + "epoch": 0.8199854986487377, + "grad_norm": 0.08333491533994675, + "learning_rate": 0.00011806306900646524, + "loss": 0.0049, + "step": 3110 + }, + { + "epoch": 0.8213038033089447, + "grad_norm": 0.0355968177318573, + "learning_rate": 0.0001179311254782953, + "loss": 0.0051, + "step": 3115 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.06948401033878326, + "learning_rate": 0.00011779918195012536, + "loss": 0.013, + "step": 3120 + }, + { + "epoch": 0.8239404126293587, + "grad_norm": 0.03328891843557358, + "learning_rate": 0.00011766723842195542, + "loss": 0.0122, + "step": 3125 + }, + { + "epoch": 0.8252587172895656, + "grad_norm": 0.013782350346446037, + "learning_rate": 0.00011753529489378548, + "loss": 0.0073, + "step": 3130 + }, + { + "epoch": 0.8265770219497726, + "grad_norm": 0.024390392005443573, + "learning_rate": 0.00011740335136561553, + "loss": 0.0143, + "step": 3135 + }, + { + "epoch": 0.8278953266099796, + "grad_norm": 0.002548128366470337, + "learning_rate": 0.00011727140783744557, + "loss": 0.0027, + "step": 3140 + }, + { + "epoch": 0.8292136312701865, + "grad_norm": 0.11674848943948746, + "learning_rate": 0.00011713946430927563, + "loss": 0.0253, + "step": 3145 + }, + { + "epoch": 0.8305319359303935, + "grad_norm": 0.005774884019047022, + "learning_rate": 0.00011700752078110568, + "loss": 0.0018, + "step": 3150 + }, + { + "epoch": 0.8318502405906005, + "grad_norm": 0.5763069987297058, + "learning_rate": 0.00011687557725293574, + "loss": 0.0119, + "step": 3155 + }, + { + "epoch": 0.8331685452508074, + "grad_norm": 0.0027607593219727278, + "learning_rate": 0.0001167436337247658, + "loss": 0.0279, + "step": 3160 + }, + { + "epoch": 0.8344868499110144, + "grad_norm": 1.859642505645752, + "learning_rate": 0.00011661169019659585, + "loss": 0.0228, + "step": 3165 + }, + { + "epoch": 0.8358051545712214, + "grad_norm": 0.16597022116184235, + "learning_rate": 0.00011647974666842592, + "loss": 0.1228, + "step": 3170 + }, + { + "epoch": 0.8371234592314284, + "grad_norm": 0.33833742141723633, + "learning_rate": 0.00011634780314025598, + "loss": 0.073, + "step": 3175 + }, + { + "epoch": 0.8384417638916354, + "grad_norm": 0.024682912975549698, + "learning_rate": 0.00011621585961208603, + "loss": 0.0042, + "step": 3180 + }, + { + "epoch": 0.8397600685518424, + "grad_norm": 0.05926942452788353, + "learning_rate": 0.00011608391608391609, + "loss": 0.0066, + "step": 3185 + }, + { + "epoch": 0.8410783732120493, + "grad_norm": 0.1414029747247696, + "learning_rate": 0.00011595197255574614, + "loss": 0.0603, + "step": 3190 + }, + { + "epoch": 0.8423966778722562, + "grad_norm": 0.37928736209869385, + 
"learning_rate": 0.0001158200290275762, + "loss": 0.0266, + "step": 3195 + }, + { + "epoch": 0.8437149825324632, + "grad_norm": 0.018329354003071785, + "learning_rate": 0.00011568808549940627, + "loss": 0.0047, + "step": 3200 + }, + { + "epoch": 0.8450332871926702, + "grad_norm": 0.2993735373020172, + "learning_rate": 0.00011555614197123632, + "loss": 0.0218, + "step": 3205 + }, + { + "epoch": 0.8463515918528772, + "grad_norm": 0.1767728328704834, + "learning_rate": 0.00011542419844306638, + "loss": 0.0363, + "step": 3210 + }, + { + "epoch": 0.8476698965130842, + "grad_norm": 0.39774414896965027, + "learning_rate": 0.00011529225491489644, + "loss": 0.0506, + "step": 3215 + }, + { + "epoch": 0.8489882011732911, + "grad_norm": 0.021896762773394585, + "learning_rate": 0.00011516031138672649, + "loss": 0.0081, + "step": 3220 + }, + { + "epoch": 0.8503065058334981, + "grad_norm": 0.358372300863266, + "learning_rate": 0.00011502836785855655, + "loss": 0.0224, + "step": 3225 + }, + { + "epoch": 0.8516248104937051, + "grad_norm": 0.01605542004108429, + "learning_rate": 0.00011489642433038662, + "loss": 0.0215, + "step": 3230 + }, + { + "epoch": 0.8529431151539121, + "grad_norm": 0.021189266815781593, + "learning_rate": 0.00011476448080221667, + "loss": 0.0051, + "step": 3235 + }, + { + "epoch": 0.8542614198141191, + "grad_norm": 0.013394076377153397, + "learning_rate": 0.0001146325372740467, + "loss": 0.021, + "step": 3240 + }, + { + "epoch": 0.8555797244743261, + "grad_norm": 0.19848507642745972, + "learning_rate": 0.00011450059374587676, + "loss": 0.0285, + "step": 3245 + }, + { + "epoch": 0.856898029134533, + "grad_norm": 0.2463046759366989, + "learning_rate": 0.00011436865021770683, + "loss": 0.0384, + "step": 3250 + }, + { + "epoch": 0.8582163337947399, + "grad_norm": 0.37432390451431274, + "learning_rate": 0.00011423670668953688, + "loss": 0.0098, + "step": 3255 + }, + { + "epoch": 0.8595346384549469, + "grad_norm": 0.060943394899368286, + "learning_rate": 0.00011410476316136694, + "loss": 0.0087, + "step": 3260 + }, + { + "epoch": 0.8608529431151539, + "grad_norm": 0.2846696674823761, + "learning_rate": 0.00011397281963319699, + "loss": 0.0148, + "step": 3265 + }, + { + "epoch": 0.8621712477753609, + "grad_norm": 0.009311323054134846, + "learning_rate": 0.00011384087610502705, + "loss": 0.0024, + "step": 3270 + }, + { + "epoch": 0.8634895524355679, + "grad_norm": 0.046277035027742386, + "learning_rate": 0.0001137089325768571, + "loss": 0.0274, + "step": 3275 + }, + { + "epoch": 0.8648078570957748, + "grad_norm": 0.006024620030075312, + "learning_rate": 0.00011357698904868716, + "loss": 0.0286, + "step": 3280 + }, + { + "epoch": 0.8661261617559818, + "grad_norm": 0.033578380942344666, + "learning_rate": 0.00011344504552051723, + "loss": 0.0153, + "step": 3285 + }, + { + "epoch": 0.8674444664161888, + "grad_norm": 0.8537917137145996, + "learning_rate": 0.00011331310199234728, + "loss": 0.0304, + "step": 3290 + }, + { + "epoch": 0.8687627710763958, + "grad_norm": 0.013933337293565273, + "learning_rate": 0.00011318115846417734, + "loss": 0.0112, + "step": 3295 + }, + { + "epoch": 0.8700810757366028, + "grad_norm": 0.35437721014022827, + "learning_rate": 0.0001130492149360074, + "loss": 0.0228, + "step": 3300 + }, + { + "epoch": 0.8713993803968098, + "grad_norm": 1.3024121522903442, + "learning_rate": 0.00011291727140783745, + "loss": 0.0203, + "step": 3305 + }, + { + "epoch": 0.8727176850570166, + "grad_norm": 0.5131255984306335, + "learning_rate": 0.00011278532787966751, + "loss": 0.0181, + 
"step": 3310 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.039366886019706726, + "learning_rate": 0.00011265338435149758, + "loss": 0.0192, + "step": 3315 + }, + { + "epoch": 0.8753542943774306, + "grad_norm": 0.13679669797420502, + "learning_rate": 0.00011252144082332763, + "loss": 0.004, + "step": 3320 + }, + { + "epoch": 0.8766725990376376, + "grad_norm": 0.003076886525377631, + "learning_rate": 0.00011238949729515769, + "loss": 0.0405, + "step": 3325 + }, + { + "epoch": 0.8779909036978446, + "grad_norm": 0.019953785464167595, + "learning_rate": 0.00011225755376698774, + "loss": 0.0241, + "step": 3330 + }, + { + "epoch": 0.8793092083580516, + "grad_norm": 0.007980377413332462, + "learning_rate": 0.0001121256102388178, + "loss": 0.0064, + "step": 3335 + }, + { + "epoch": 0.8806275130182585, + "grad_norm": 0.018761295825242996, + "learning_rate": 0.00011199366671064784, + "loss": 0.0032, + "step": 3340 + }, + { + "epoch": 0.8819458176784655, + "grad_norm": 0.022511709481477737, + "learning_rate": 0.0001118617231824779, + "loss": 0.0055, + "step": 3345 + }, + { + "epoch": 0.8832641223386725, + "grad_norm": 0.021270718425512314, + "learning_rate": 0.00011172977965430795, + "loss": 0.033, + "step": 3350 + }, + { + "epoch": 0.8845824269988795, + "grad_norm": 0.02710561640560627, + "learning_rate": 0.00011159783612613801, + "loss": 0.0094, + "step": 3355 + }, + { + "epoch": 0.8859007316590864, + "grad_norm": 0.4353378117084503, + "learning_rate": 0.00011146589259796806, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.8872190363192934, + "grad_norm": 0.0257766991853714, + "learning_rate": 0.00011133394906979813, + "loss": 0.0059, + "step": 3365 + }, + { + "epoch": 0.8885373409795003, + "grad_norm": 0.80838942527771, + "learning_rate": 0.00011120200554162819, + "loss": 0.0263, + "step": 3370 + }, + { + "epoch": 0.8898556456397073, + "grad_norm": 0.007799761835485697, + "learning_rate": 0.00011107006201345824, + "loss": 0.0028, + "step": 3375 + }, + { + "epoch": 0.8911739502999143, + "grad_norm": 0.007315775845199823, + "learning_rate": 0.0001109381184852883, + "loss": 0.0127, + "step": 3380 + }, + { + "epoch": 0.8924922549601213, + "grad_norm": 1.4861233234405518, + "learning_rate": 0.00011080617495711836, + "loss": 0.0562, + "step": 3385 + }, + { + "epoch": 0.8938105596203283, + "grad_norm": 0.010219530202448368, + "learning_rate": 0.00011067423142894841, + "loss": 0.0438, + "step": 3390 + }, + { + "epoch": 0.8951288642805353, + "grad_norm": 1.0191857814788818, + "learning_rate": 0.00011054228790077848, + "loss": 0.0493, + "step": 3395 + }, + { + "epoch": 0.8964471689407422, + "grad_norm": 0.01459536887705326, + "learning_rate": 0.00011041034437260854, + "loss": 0.0117, + "step": 3400 + }, + { + "epoch": 0.8977654736009492, + "grad_norm": 0.008682495914399624, + "learning_rate": 0.00011027840084443859, + "loss": 0.02, + "step": 3405 + }, + { + "epoch": 0.8990837782611562, + "grad_norm": 0.02197263017296791, + "learning_rate": 0.00011014645731626865, + "loss": 0.0454, + "step": 3410 + }, + { + "epoch": 0.9004020829213631, + "grad_norm": 0.01436714269220829, + "learning_rate": 0.0001100145137880987, + "loss": 0.0283, + "step": 3415 + }, + { + "epoch": 0.9017203875815701, + "grad_norm": 0.14327946305274963, + "learning_rate": 0.00010988257025992876, + "loss": 0.0461, + "step": 3420 + }, + { + "epoch": 0.9030386922417771, + "grad_norm": 1.671773910522461, + "learning_rate": 0.00010975062673175883, + "loss": 0.054, + "step": 3425 + }, + { + "epoch": 0.904356996901984, + 
"grad_norm": 0.009926804341375828, + "learning_rate": 0.00010961868320358888, + "loss": 0.0429, + "step": 3430 + }, + { + "epoch": 0.905675301562191, + "grad_norm": 0.554020881652832, + "learning_rate": 0.00010948673967541894, + "loss": 0.0618, + "step": 3435 + }, + { + "epoch": 0.906993606222398, + "grad_norm": 0.1399248093366623, + "learning_rate": 0.00010935479614724897, + "loss": 0.0229, + "step": 3440 + }, + { + "epoch": 0.908311910882605, + "grad_norm": 0.02739197015762329, + "learning_rate": 0.00010922285261907904, + "loss": 0.0082, + "step": 3445 + }, + { + "epoch": 0.909630215542812, + "grad_norm": 0.33394527435302734, + "learning_rate": 0.00010909090909090909, + "loss": 0.0403, + "step": 3450 + }, + { + "epoch": 0.9109485202030189, + "grad_norm": 0.08083894103765488, + "learning_rate": 0.00010895896556273915, + "loss": 0.0406, + "step": 3455 + }, + { + "epoch": 0.9122668248632259, + "grad_norm": 0.39336663484573364, + "learning_rate": 0.0001088270220345692, + "loss": 0.02, + "step": 3460 + }, + { + "epoch": 0.9135851295234328, + "grad_norm": 0.20481553673744202, + "learning_rate": 0.00010869507850639926, + "loss": 0.0221, + "step": 3465 + }, + { + "epoch": 0.9149034341836398, + "grad_norm": 1.4507408142089844, + "learning_rate": 0.00010856313497822932, + "loss": 0.0357, + "step": 3470 + }, + { + "epoch": 0.9162217388438468, + "grad_norm": 0.2678806483745575, + "learning_rate": 0.00010843119145005937, + "loss": 0.0181, + "step": 3475 + }, + { + "epoch": 0.9175400435040538, + "grad_norm": 0.007361674215644598, + "learning_rate": 0.00010829924792188944, + "loss": 0.0978, + "step": 3480 + }, + { + "epoch": 0.9188583481642607, + "grad_norm": 0.773695707321167, + "learning_rate": 0.0001081673043937195, + "loss": 0.0401, + "step": 3485 + }, + { + "epoch": 0.9201766528244677, + "grad_norm": 0.0010772625682875514, + "learning_rate": 0.00010803536086554955, + "loss": 0.0233, + "step": 3490 + }, + { + "epoch": 0.9214949574846747, + "grad_norm": 0.08971104770898819, + "learning_rate": 0.00010790341733737961, + "loss": 0.0319, + "step": 3495 + }, + { + "epoch": 0.9228132621448817, + "grad_norm": 0.21372731029987335, + "learning_rate": 0.00010777147380920966, + "loss": 0.0315, + "step": 3500 + }, + { + "epoch": 0.9228132621448817, + "eval_loss": 0.02952708676457405, + "eval_runtime": 451.5837, + "eval_samples_per_second": 7.467, + "eval_steps_per_second": 3.734, + "step": 3500 + }, + { + "epoch": 0.9241315668050887, + "grad_norm": 0.016639264300465584, + "learning_rate": 0.00010763953028103972, + "loss": 0.0125, + "step": 3505 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.46340492367744446, + "learning_rate": 0.00010750758675286979, + "loss": 0.0186, + "step": 3510 + }, + { + "epoch": 0.9267681761255026, + "grad_norm": 0.01847526989877224, + "learning_rate": 0.00010737564322469984, + "loss": 0.0026, + "step": 3515 + }, + { + "epoch": 0.9280864807857095, + "grad_norm": 0.5947860479354858, + "learning_rate": 0.0001072436996965299, + "loss": 0.0259, + "step": 3520 + }, + { + "epoch": 0.9294047854459165, + "grad_norm": 0.06145291402935982, + "learning_rate": 0.00010711175616835995, + "loss": 0.0057, + "step": 3525 + }, + { + "epoch": 0.9307230901061235, + "grad_norm": 0.0143959429115057, + "learning_rate": 0.00010697981264019001, + "loss": 0.0145, + "step": 3530 + }, + { + "epoch": 0.9320413947663305, + "grad_norm": 0.21143831312656403, + "learning_rate": 0.00010684786911202007, + "loss": 0.0459, + "step": 3535 + }, + { + "epoch": 0.9333596994265375, + "grad_norm": 0.02548077143728733, 
+ "learning_rate": 0.00010671592558385011, + "loss": 0.0051, + "step": 3540 + }, + { + "epoch": 0.9346780040867444, + "grad_norm": 0.008077048696577549, + "learning_rate": 0.00010658398205568016, + "loss": 0.0306, + "step": 3545 + }, + { + "epoch": 0.9359963087469514, + "grad_norm": 0.0030760422814637423, + "learning_rate": 0.00010645203852751022, + "loss": 0.0575, + "step": 3550 + }, + { + "epoch": 0.9373146134071584, + "grad_norm": 0.18114158511161804, + "learning_rate": 0.00010632009499934027, + "loss": 0.0885, + "step": 3555 + }, + { + "epoch": 0.9386329180673654, + "grad_norm": 0.02450549602508545, + "learning_rate": 0.00010618815147117034, + "loss": 0.0045, + "step": 3560 + }, + { + "epoch": 0.9399512227275724, + "grad_norm": 0.1238626018166542, + "learning_rate": 0.0001060562079430004, + "loss": 0.0166, + "step": 3565 + }, + { + "epoch": 0.9412695273877794, + "grad_norm": 0.1879919469356537, + "learning_rate": 0.00010592426441483046, + "loss": 0.0077, + "step": 3570 + }, + { + "epoch": 0.9425878320479862, + "grad_norm": 0.11323565989732742, + "learning_rate": 0.00010579232088666051, + "loss": 0.0213, + "step": 3575 + }, + { + "epoch": 0.9439061367081932, + "grad_norm": 0.35575854778289795, + "learning_rate": 0.00010566037735849057, + "loss": 0.0336, + "step": 3580 + }, + { + "epoch": 0.9452244413684002, + "grad_norm": 0.14052227139472961, + "learning_rate": 0.00010552843383032062, + "loss": 0.0325, + "step": 3585 + }, + { + "epoch": 0.9465427460286072, + "grad_norm": 0.2643798887729645, + "learning_rate": 0.00010539649030215069, + "loss": 0.0192, + "step": 3590 + }, + { + "epoch": 0.9478610506888142, + "grad_norm": 0.3207031190395355, + "learning_rate": 0.00010526454677398075, + "loss": 0.0221, + "step": 3595 + }, + { + "epoch": 0.9491793553490212, + "grad_norm": 0.022803861647844315, + "learning_rate": 0.0001051326032458108, + "loss": 0.029, + "step": 3600 + }, + { + "epoch": 0.9504976600092281, + "grad_norm": 0.02511664852499962, + "learning_rate": 0.00010500065971764086, + "loss": 0.0422, + "step": 3605 + }, + { + "epoch": 0.9518159646694351, + "grad_norm": 0.06505445390939713, + "learning_rate": 0.00010486871618947091, + "loss": 0.0092, + "step": 3610 + }, + { + "epoch": 0.9531342693296421, + "grad_norm": 0.09998584538698196, + "learning_rate": 0.00010473677266130097, + "loss": 0.0242, + "step": 3615 + }, + { + "epoch": 0.9544525739898491, + "grad_norm": 0.9645698666572571, + "learning_rate": 0.00010460482913313104, + "loss": 0.0124, + "step": 3620 + }, + { + "epoch": 0.955770878650056, + "grad_norm": 0.2389964610338211, + "learning_rate": 0.0001044728856049611, + "loss": 0.0169, + "step": 3625 + }, + { + "epoch": 0.957089183310263, + "grad_norm": 2.030608654022217, + "learning_rate": 0.00010434094207679115, + "loss": 0.0518, + "step": 3630 + }, + { + "epoch": 0.9584074879704699, + "grad_norm": 0.05979987606406212, + "learning_rate": 0.0001042089985486212, + "loss": 0.0081, + "step": 3635 + }, + { + "epoch": 0.9597257926306769, + "grad_norm": 0.15761719644069672, + "learning_rate": 0.00010407705502045125, + "loss": 0.0061, + "step": 3640 + }, + { + "epoch": 0.9610440972908839, + "grad_norm": 0.6534290909767151, + "learning_rate": 0.0001039451114922813, + "loss": 0.0104, + "step": 3645 + }, + { + "epoch": 0.9623624019510909, + "grad_norm": 1.0324147939682007, + "learning_rate": 0.00010381316796411136, + "loss": 0.0381, + "step": 3650 + }, + { + "epoch": 0.9636807066112979, + "grad_norm": 0.002968872431665659, + "learning_rate": 0.00010368122443594142, + "loss": 0.0343, + "step": 
3655 + }, + { + "epoch": 0.9649990112715049, + "grad_norm": 0.011243184097111225, + "learning_rate": 0.00010354928090777147, + "loss": 0.019, + "step": 3660 + }, + { + "epoch": 0.9663173159317118, + "grad_norm": 0.17663739621639252, + "learning_rate": 0.00010341733737960153, + "loss": 0.0452, + "step": 3665 + }, + { + "epoch": 0.9676356205919188, + "grad_norm": 1.2647719383239746, + "learning_rate": 0.00010328539385143158, + "loss": 0.0154, + "step": 3670 + }, + { + "epoch": 0.9689539252521258, + "grad_norm": 0.3691752552986145, + "learning_rate": 0.00010315345032326165, + "loss": 0.028, + "step": 3675 + }, + { + "epoch": 0.9702722299123328, + "grad_norm": 0.0015879774000495672, + "learning_rate": 0.00010302150679509171, + "loss": 0.0202, + "step": 3680 + }, + { + "epoch": 0.9715905345725397, + "grad_norm": 0.1441984623670578, + "learning_rate": 0.00010288956326692176, + "loss": 0.0221, + "step": 3685 + }, + { + "epoch": 0.9729088392327467, + "grad_norm": 0.20431455969810486, + "learning_rate": 0.00010275761973875182, + "loss": 0.0072, + "step": 3690 + }, + { + "epoch": 0.9742271438929536, + "grad_norm": 0.861625611782074, + "learning_rate": 0.00010262567621058187, + "loss": 0.0523, + "step": 3695 + }, + { + "epoch": 0.9755454485531606, + "grad_norm": 0.005049478262662888, + "learning_rate": 0.00010249373268241193, + "loss": 0.0051, + "step": 3700 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.49685510993003845, + "learning_rate": 0.000102361789154242, + "loss": 0.023, + "step": 3705 + }, + { + "epoch": 0.9781820578735746, + "grad_norm": 0.08789395540952682, + "learning_rate": 0.00010222984562607205, + "loss": 0.0159, + "step": 3710 + }, + { + "epoch": 0.9795003625337816, + "grad_norm": 0.027168691158294678, + "learning_rate": 0.00010209790209790211, + "loss": 0.0083, + "step": 3715 + }, + { + "epoch": 0.9808186671939886, + "grad_norm": 0.0006773864733986557, + "learning_rate": 0.00010196595856973217, + "loss": 0.0048, + "step": 3720 + }, + { + "epoch": 0.9821369718541955, + "grad_norm": 0.01636457070708275, + "learning_rate": 0.00010183401504156222, + "loss": 0.0159, + "step": 3725 + }, + { + "epoch": 0.9834552765144025, + "grad_norm": 0.10160859674215317, + "learning_rate": 0.00010170207151339228, + "loss": 0.0047, + "step": 3730 + }, + { + "epoch": 0.9847735811746094, + "grad_norm": 0.14173269271850586, + "learning_rate": 0.00010157012798522232, + "loss": 0.006, + "step": 3735 + }, + { + "epoch": 0.9860918858348164, + "grad_norm": 0.003458512481302023, + "learning_rate": 0.00010143818445705238, + "loss": 0.0193, + "step": 3740 + }, + { + "epoch": 0.9874101904950234, + "grad_norm": 0.005163820460438728, + "learning_rate": 0.00010130624092888243, + "loss": 0.0039, + "step": 3745 + }, + { + "epoch": 0.9887284951552304, + "grad_norm": 0.005913791712373495, + "learning_rate": 0.00010117429740071249, + "loss": 0.0119, + "step": 3750 + }, + { + "epoch": 0.9900467998154373, + "grad_norm": 0.00800853967666626, + "learning_rate": 0.00010104235387254256, + "loss": 0.044, + "step": 3755 + }, + { + "epoch": 0.9913651044756443, + "grad_norm": 0.18146778643131256, + "learning_rate": 0.00010091041034437261, + "loss": 0.0048, + "step": 3760 + }, + { + "epoch": 0.9926834091358513, + "grad_norm": 0.01235104724764824, + "learning_rate": 0.00010077846681620267, + "loss": 0.0017, + "step": 3765 + }, + { + "epoch": 0.9940017137960583, + "grad_norm": 0.17677897214889526, + "learning_rate": 0.00010064652328803272, + "loss": 0.0339, + "step": 3770 + }, + { + "epoch": 0.9953200184562653, + "grad_norm": 
0.0017472271574661136, + "learning_rate": 0.00010051457975986278, + "loss": 0.0494, + "step": 3775 + }, + { + "epoch": 0.9966383231164723, + "grad_norm": 0.10814860463142395, + "learning_rate": 0.00010038263623169283, + "loss": 0.0741, + "step": 3780 + }, + { + "epoch": 0.9979566277766792, + "grad_norm": 0.11329760402441025, + "learning_rate": 0.0001002506927035229, + "loss": 0.0182, + "step": 3785 + }, + { + "epoch": 0.9992749324368861, + "grad_norm": 0.11573276668787003, + "learning_rate": 0.00010011874917535296, + "loss": 0.0068, + "step": 3790 + }, + { + "epoch": 1.000790982796124, + "grad_norm": 0.08449886739253998, + "learning_rate": 9.998680564718301e-05, + "loss": 0.0141, + "step": 3795 + }, + { + "epoch": 1.002109287456331, + "grad_norm": 0.05035184696316719, + "learning_rate": 9.985486211901307e-05, + "loss": 0.0293, + "step": 3800 + }, + { + "epoch": 1.003427592116538, + "grad_norm": 0.0255444198846817, + "learning_rate": 9.972291859084313e-05, + "loss": 0.0054, + "step": 3805 + }, + { + "epoch": 1.004745896776745, + "grad_norm": 0.0033677336759865284, + "learning_rate": 9.959097506267318e-05, + "loss": 0.0567, + "step": 3810 + }, + { + "epoch": 1.006064201436952, + "grad_norm": 0.09453682601451874, + "learning_rate": 9.945903153450324e-05, + "loss": 0.0589, + "step": 3815 + }, + { + "epoch": 1.007382506097159, + "grad_norm": 0.01592979207634926, + "learning_rate": 9.932708800633329e-05, + "loss": 0.0043, + "step": 3820 + }, + { + "epoch": 1.008700810757366, + "grad_norm": 0.002263693604618311, + "learning_rate": 9.919514447816335e-05, + "loss": 0.0195, + "step": 3825 + }, + { + "epoch": 1.010019115417573, + "grad_norm": 0.013390793465077877, + "learning_rate": 9.90632009499934e-05, + "loss": 0.0152, + "step": 3830 + }, + { + "epoch": 1.01133742007778, + "grad_norm": 0.10473847389221191, + "learning_rate": 9.893125742182346e-05, + "loss": 0.0606, + "step": 3835 + }, + { + "epoch": 1.012655724737987, + "grad_norm": 0.05837221071124077, + "learning_rate": 9.879931389365353e-05, + "loss": 0.0121, + "step": 3840 + }, + { + "epoch": 1.013974029398194, + "grad_norm": 0.3803791105747223, + "learning_rate": 9.866737036548358e-05, + "loss": 0.0386, + "step": 3845 + }, + { + "epoch": 1.0152923340584008, + "grad_norm": 0.4067519009113312, + "learning_rate": 9.853542683731364e-05, + "loss": 0.0115, + "step": 3850 + }, + { + "epoch": 1.0166106387186078, + "grad_norm": 0.02585229091346264, + "learning_rate": 9.84034833091437e-05, + "loss": 0.0214, + "step": 3855 + }, + { + "epoch": 1.0179289433788148, + "grad_norm": 0.03670825809240341, + "learning_rate": 9.827153978097374e-05, + "loss": 0.0059, + "step": 3860 + }, + { + "epoch": 1.0192472480390218, + "grad_norm": 0.014171554706990719, + "learning_rate": 9.81395962528038e-05, + "loss": 0.0145, + "step": 3865 + }, + { + "epoch": 1.0205655526992288, + "grad_norm": 0.027376385405659676, + "learning_rate": 9.800765272463386e-05, + "loss": 0.0089, + "step": 3870 + }, + { + "epoch": 1.0218838573594358, + "grad_norm": 0.03168405964970589, + "learning_rate": 9.787570919646392e-05, + "loss": 0.0132, + "step": 3875 + }, + { + "epoch": 1.0232021620196428, + "grad_norm": 0.03346199914813042, + "learning_rate": 9.774376566829397e-05, + "loss": 0.0246, + "step": 3880 + }, + { + "epoch": 1.0245204666798498, + "grad_norm": 0.00894144270569086, + "learning_rate": 9.761182214012403e-05, + "loss": 0.0105, + "step": 3885 + }, + { + "epoch": 1.0258387713400567, + "grad_norm": 0.3172806203365326, + "learning_rate": 9.747987861195409e-05, + "loss": 0.0103, + 
"step": 3890 + }, + { + "epoch": 1.0271570760002637, + "grad_norm": 0.009055040776729584, + "learning_rate": 9.734793508378414e-05, + "loss": 0.0103, + "step": 3895 + }, + { + "epoch": 1.0284753806604707, + "grad_norm": 0.014140011742711067, + "learning_rate": 9.721599155561421e-05, + "loss": 0.0037, + "step": 3900 + }, + { + "epoch": 1.0297936853206777, + "grad_norm": 0.008317383006215096, + "learning_rate": 9.708404802744427e-05, + "loss": 0.002, + "step": 3905 + }, + { + "epoch": 1.0311119899808845, + "grad_norm": 0.005038558971136808, + "learning_rate": 9.695210449927431e-05, + "loss": 0.0017, + "step": 3910 + }, + { + "epoch": 1.0324302946410915, + "grad_norm": 0.40058520436286926, + "learning_rate": 9.682016097110436e-05, + "loss": 0.0065, + "step": 3915 + }, + { + "epoch": 1.0337485993012985, + "grad_norm": 0.005197151098400354, + "learning_rate": 9.668821744293442e-05, + "loss": 0.0031, + "step": 3920 + }, + { + "epoch": 1.0350669039615055, + "grad_norm": 0.014353781007230282, + "learning_rate": 9.655627391476449e-05, + "loss": 0.0009, + "step": 3925 + }, + { + "epoch": 1.0363852086217125, + "grad_norm": 0.13260559737682343, + "learning_rate": 9.642433038659454e-05, + "loss": 0.0323, + "step": 3930 + }, + { + "epoch": 1.0377035132819195, + "grad_norm": 0.006795065477490425, + "learning_rate": 9.62923868584246e-05, + "loss": 0.0022, + "step": 3935 + }, + { + "epoch": 1.0390218179421264, + "grad_norm": 0.2276086062192917, + "learning_rate": 9.616044333025466e-05, + "loss": 0.0221, + "step": 3940 + }, + { + "epoch": 1.0403401226023334, + "grad_norm": 0.06121920794248581, + "learning_rate": 9.602849980208471e-05, + "loss": 0.0037, + "step": 3945 + }, + { + "epoch": 1.0416584272625404, + "grad_norm": 0.9180755019187927, + "learning_rate": 9.589655627391477e-05, + "loss": 0.0589, + "step": 3950 + }, + { + "epoch": 1.0429767319227474, + "grad_norm": 0.07515591382980347, + "learning_rate": 9.576461274574484e-05, + "loss": 0.0653, + "step": 3955 + }, + { + "epoch": 1.0442950365829544, + "grad_norm": 0.018060607835650444, + "learning_rate": 9.563266921757488e-05, + "loss": 0.0178, + "step": 3960 + }, + { + "epoch": 1.0456133412431612, + "grad_norm": 0.02751368284225464, + "learning_rate": 9.550072568940493e-05, + "loss": 0.0076, + "step": 3965 + }, + { + "epoch": 1.0469316459033682, + "grad_norm": 0.653998613357544, + "learning_rate": 9.536878216123499e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 1.0482499505635752, + "grad_norm": 0.3117768168449402, + "learning_rate": 9.523683863306505e-05, + "loss": 0.0087, + "step": 3975 + }, + { + "epoch": 1.0495682552237822, + "grad_norm": 0.013952831737697124, + "learning_rate": 9.510489510489511e-05, + "loss": 0.0037, + "step": 3980 + }, + { + "epoch": 1.0508865598839892, + "grad_norm": 0.01806250400841236, + "learning_rate": 9.497295157672517e-05, + "loss": 0.0028, + "step": 3985 + }, + { + "epoch": 1.0522048645441962, + "grad_norm": 0.13678006827831268, + "learning_rate": 9.484100804855523e-05, + "loss": 0.0533, + "step": 3990 + }, + { + "epoch": 1.0535231692044031, + "grad_norm": 0.14869382977485657, + "learning_rate": 9.470906452038528e-05, + "loss": 0.009, + "step": 3995 + }, + { + "epoch": 1.0548414738646101, + "grad_norm": 0.33614659309387207, + "learning_rate": 9.457712099221534e-05, + "loss": 0.0555, + "step": 4000 + }, + { + "epoch": 1.0548414738646101, + "eval_loss": 0.026165226474404335, + "eval_runtime": 452.2482, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 4000 + }, + { + "epoch": 
1.0561597785248171, + "grad_norm": 0.007546027656644583, + "learning_rate": 9.444517746404539e-05, + "loss": 0.0029, + "step": 4005 + }, + { + "epoch": 1.0574780831850241, + "grad_norm": 0.3720332384109497, + "learning_rate": 9.431323393587545e-05, + "loss": 0.0353, + "step": 4010 + }, + { + "epoch": 1.0587963878452311, + "grad_norm": 1.1335264444351196, + "learning_rate": 9.41812904077055e-05, + "loss": 0.0142, + "step": 4015 + }, + { + "epoch": 1.060114692505438, + "grad_norm": 0.024723488837480545, + "learning_rate": 9.404934687953556e-05, + "loss": 0.006, + "step": 4020 + }, + { + "epoch": 1.0614329971656449, + "grad_norm": 0.040354058146476746, + "learning_rate": 9.391740335136562e-05, + "loss": 0.0107, + "step": 4025 + }, + { + "epoch": 1.0627513018258519, + "grad_norm": 0.222810298204422, + "learning_rate": 9.378545982319567e-05, + "loss": 0.0273, + "step": 4030 + }, + { + "epoch": 1.0640696064860589, + "grad_norm": 0.025684095919132233, + "learning_rate": 9.365351629502574e-05, + "loss": 0.0033, + "step": 4035 + }, + { + "epoch": 1.0653879111462659, + "grad_norm": 0.05338352546095848, + "learning_rate": 9.35215727668558e-05, + "loss": 0.0052, + "step": 4040 + }, + { + "epoch": 1.0667062158064728, + "grad_norm": 0.06182330474257469, + "learning_rate": 9.338962923868585e-05, + "loss": 0.0038, + "step": 4045 + }, + { + "epoch": 1.0680245204666798, + "grad_norm": 0.012170832604169846, + "learning_rate": 9.325768571051591e-05, + "loss": 0.0018, + "step": 4050 + }, + { + "epoch": 1.0693428251268868, + "grad_norm": 0.5424306392669678, + "learning_rate": 9.312574218234596e-05, + "loss": 0.0445, + "step": 4055 + }, + { + "epoch": 1.0706611297870938, + "grad_norm": 0.017939254641532898, + "learning_rate": 9.299379865417602e-05, + "loss": 0.0389, + "step": 4060 + }, + { + "epoch": 1.0719794344473008, + "grad_norm": 0.0060431682504713535, + "learning_rate": 9.286185512600607e-05, + "loss": 0.0025, + "step": 4065 + }, + { + "epoch": 1.0732977391075078, + "grad_norm": 0.0071444883942604065, + "learning_rate": 9.272991159783613e-05, + "loss": 0.0333, + "step": 4070 + }, + { + "epoch": 1.0746160437677148, + "grad_norm": 0.29632750153541565, + "learning_rate": 9.259796806966619e-05, + "loss": 0.0151, + "step": 4075 + }, + { + "epoch": 1.0759343484279218, + "grad_norm": 0.004526323173195124, + "learning_rate": 9.246602454149624e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 1.0772526530881286, + "grad_norm": 0.023945212364196777, + "learning_rate": 9.23340810133263e-05, + "loss": 0.004, + "step": 4085 + }, + { + "epoch": 1.0785709577483356, + "grad_norm": 0.13235126435756683, + "learning_rate": 9.220213748515635e-05, + "loss": 0.0059, + "step": 4090 + }, + { + "epoch": 1.0798892624085425, + "grad_norm": 0.17592330276966095, + "learning_rate": 9.207019395698642e-05, + "loss": 0.0302, + "step": 4095 + }, + { + "epoch": 1.0812075670687495, + "grad_norm": 0.004582866560667753, + "learning_rate": 9.193825042881648e-05, + "loss": 0.009, + "step": 4100 + }, + { + "epoch": 1.0825258717289565, + "grad_norm": 0.15214525163173676, + "learning_rate": 9.180630690064653e-05, + "loss": 0.0062, + "step": 4105 + }, + { + "epoch": 1.0838441763891635, + "grad_norm": 0.16535983979701996, + "learning_rate": 9.167436337247658e-05, + "loss": 0.0926, + "step": 4110 + }, + { + "epoch": 1.0851624810493705, + "grad_norm": 0.013285227119922638, + "learning_rate": 9.154241984430663e-05, + "loss": 0.0043, + "step": 4115 + }, + { + "epoch": 1.0864807857095775, + "grad_norm": 0.012116984464228153, + "learning_rate": 
9.14104763161367e-05, + "loss": 0.0037, + "step": 4120 + }, + { + "epoch": 1.0877990903697845, + "grad_norm": 0.0373845212161541, + "learning_rate": 9.127853278796676e-05, + "loss": 0.0081, + "step": 4125 + }, + { + "epoch": 1.0891173950299915, + "grad_norm": 0.09324615448713303, + "learning_rate": 9.114658925979681e-05, + "loss": 0.0534, + "step": 4130 + }, + { + "epoch": 1.0904356996901985, + "grad_norm": 0.010992968454957008, + "learning_rate": 9.101464573162687e-05, + "loss": 0.0025, + "step": 4135 + }, + { + "epoch": 1.0917540043504055, + "grad_norm": 0.13710318505764008, + "learning_rate": 9.088270220345692e-05, + "loss": 0.0555, + "step": 4140 + }, + { + "epoch": 1.0930723090106123, + "grad_norm": 0.010403074324131012, + "learning_rate": 9.075075867528698e-05, + "loss": 0.0042, + "step": 4145 + }, + { + "epoch": 1.0943906136708192, + "grad_norm": 0.21544460952281952, + "learning_rate": 9.061881514711705e-05, + "loss": 0.0144, + "step": 4150 + }, + { + "epoch": 1.0957089183310262, + "grad_norm": 0.04194799065589905, + "learning_rate": 9.04868716189471e-05, + "loss": 0.0106, + "step": 4155 + }, + { + "epoch": 1.0970272229912332, + "grad_norm": 0.029204202815890312, + "learning_rate": 9.035492809077715e-05, + "loss": 0.0085, + "step": 4160 + }, + { + "epoch": 1.0983455276514402, + "grad_norm": 0.006751026958227158, + "learning_rate": 9.02229845626072e-05, + "loss": 0.0049, + "step": 4165 + }, + { + "epoch": 1.0996638323116472, + "grad_norm": 0.008232722990214825, + "learning_rate": 9.009104103443726e-05, + "loss": 0.0172, + "step": 4170 + }, + { + "epoch": 1.1009821369718542, + "grad_norm": 0.05630079656839371, + "learning_rate": 8.995909750626733e-05, + "loss": 0.0112, + "step": 4175 + }, + { + "epoch": 1.1023004416320612, + "grad_norm": 0.0011601662263274193, + "learning_rate": 8.982715397809738e-05, + "loss": 0.0317, + "step": 4180 + }, + { + "epoch": 1.1036187462922682, + "grad_norm": 0.006554402410984039, + "learning_rate": 8.969521044992744e-05, + "loss": 0.0035, + "step": 4185 + }, + { + "epoch": 1.1049370509524752, + "grad_norm": 0.34513652324676514, + "learning_rate": 8.956326692175749e-05, + "loss": 0.0036, + "step": 4190 + }, + { + "epoch": 1.1062553556126822, + "grad_norm": 0.283669650554657, + "learning_rate": 8.943132339358755e-05, + "loss": 0.0182, + "step": 4195 + }, + { + "epoch": 1.1075736602728892, + "grad_norm": 0.5376952290534973, + "learning_rate": 8.92993798654176e-05, + "loss": 0.0293, + "step": 4200 + }, + { + "epoch": 1.108891964933096, + "grad_norm": 0.01689724065363407, + "learning_rate": 8.916743633724767e-05, + "loss": 0.0206, + "step": 4205 + }, + { + "epoch": 1.110210269593303, + "grad_norm": 0.026538770645856857, + "learning_rate": 8.903549280907772e-05, + "loss": 0.0181, + "step": 4210 + }, + { + "epoch": 1.11152857425351, + "grad_norm": 0.6372873783111572, + "learning_rate": 8.890354928090777e-05, + "loss": 0.021, + "step": 4215 + }, + { + "epoch": 1.112846878913717, + "grad_norm": 0.06177428737282753, + "learning_rate": 8.877160575273783e-05, + "loss": 0.0033, + "step": 4220 + }, + { + "epoch": 1.114165183573924, + "grad_norm": 0.3712109923362732, + "learning_rate": 8.863966222456788e-05, + "loss": 0.0075, + "step": 4225 + }, + { + "epoch": 1.115483488234131, + "grad_norm": 0.030514653772115707, + "learning_rate": 8.850771869639795e-05, + "loss": 0.0183, + "step": 4230 + }, + { + "epoch": 1.116801792894338, + "grad_norm": 0.012861707247793674, + "learning_rate": 8.837577516822801e-05, + "loss": 0.0032, + "step": 4235 + }, + { + "epoch": 
1.118120097554545, + "grad_norm": 0.3278522789478302, + "learning_rate": 8.824383164005806e-05, + "loss": 0.0058, + "step": 4240 + }, + { + "epoch": 1.1194384022147519, + "grad_norm": 0.580259382724762, + "learning_rate": 8.811188811188812e-05, + "loss": 0.0068, + "step": 4245 + }, + { + "epoch": 1.1207567068749589, + "grad_norm": 0.007002575788646936, + "learning_rate": 8.797994458371817e-05, + "loss": 0.0063, + "step": 4250 + }, + { + "epoch": 1.1220750115351659, + "grad_norm": 0.22484643757343292, + "learning_rate": 8.784800105554823e-05, + "loss": 0.0167, + "step": 4255 + }, + { + "epoch": 1.1233933161953726, + "grad_norm": 0.004122686106711626, + "learning_rate": 8.771605752737829e-05, + "loss": 0.002, + "step": 4260 + }, + { + "epoch": 1.1247116208555796, + "grad_norm": 0.009832561016082764, + "learning_rate": 8.758411399920834e-05, + "loss": 0.0029, + "step": 4265 + }, + { + "epoch": 1.1260299255157866, + "grad_norm": 0.04854527860879898, + "learning_rate": 8.74521704710384e-05, + "loss": 0.0068, + "step": 4270 + }, + { + "epoch": 1.1273482301759936, + "grad_norm": 0.12221235036849976, + "learning_rate": 8.732022694286845e-05, + "loss": 0.003, + "step": 4275 + }, + { + "epoch": 1.1286665348362006, + "grad_norm": 0.005857539363205433, + "learning_rate": 8.718828341469851e-05, + "loss": 0.0022, + "step": 4280 + }, + { + "epoch": 1.1299848394964076, + "grad_norm": 0.10582758486270905, + "learning_rate": 8.705633988652856e-05, + "loss": 0.002, + "step": 4285 + }, + { + "epoch": 1.1313031441566146, + "grad_norm": 0.006190940272063017, + "learning_rate": 8.692439635835863e-05, + "loss": 0.0022, + "step": 4290 + }, + { + "epoch": 1.1326214488168216, + "grad_norm": 0.00221514655277133, + "learning_rate": 8.679245283018869e-05, + "loss": 0.0314, + "step": 4295 + }, + { + "epoch": 1.1339397534770286, + "grad_norm": 0.0796755850315094, + "learning_rate": 8.666050930201874e-05, + "loss": 0.0347, + "step": 4300 + }, + { + "epoch": 1.1352580581372356, + "grad_norm": 0.20088806748390198, + "learning_rate": 8.65285657738488e-05, + "loss": 0.0048, + "step": 4305 + }, + { + "epoch": 1.1365763627974426, + "grad_norm": 0.4018377363681793, + "learning_rate": 8.639662224567884e-05, + "loss": 0.0234, + "step": 4310 + }, + { + "epoch": 1.1378946674576496, + "grad_norm": 0.014961684122681618, + "learning_rate": 8.626467871750891e-05, + "loss": 0.0033, + "step": 4315 + }, + { + "epoch": 1.1392129721178565, + "grad_norm": 0.004534922540187836, + "learning_rate": 8.613273518933897e-05, + "loss": 0.0021, + "step": 4320 + }, + { + "epoch": 1.1405312767780633, + "grad_norm": 0.06340984255075455, + "learning_rate": 8.600079166116902e-05, + "loss": 0.0538, + "step": 4325 + }, + { + "epoch": 1.1418495814382703, + "grad_norm": 0.007374623324722052, + "learning_rate": 8.586884813299908e-05, + "loss": 0.0157, + "step": 4330 + }, + { + "epoch": 1.1431678860984773, + "grad_norm": 0.02313193492591381, + "learning_rate": 8.573690460482913e-05, + "loss": 0.0307, + "step": 4335 + }, + { + "epoch": 1.1444861907586843, + "grad_norm": 0.014071634039282799, + "learning_rate": 8.560496107665919e-05, + "loss": 0.0058, + "step": 4340 + }, + { + "epoch": 1.1458044954188913, + "grad_norm": 1.4664901494979858, + "learning_rate": 8.547301754848926e-05, + "loss": 0.0566, + "step": 4345 + }, + { + "epoch": 1.1471228000790983, + "grad_norm": 0.023680074140429497, + "learning_rate": 8.534107402031931e-05, + "loss": 0.0048, + "step": 4350 + }, + { + "epoch": 1.1484411047393053, + "grad_norm": 0.012555698864161968, + "learning_rate": 
8.520913049214937e-05, + "loss": 0.0076, + "step": 4355 + }, + { + "epoch": 1.1497594093995123, + "grad_norm": 0.013624129816889763, + "learning_rate": 8.507718696397941e-05, + "loss": 0.0373, + "step": 4360 + }, + { + "epoch": 1.1510777140597193, + "grad_norm": 0.015372387133538723, + "learning_rate": 8.494524343580947e-05, + "loss": 0.0147, + "step": 4365 + }, + { + "epoch": 1.1523960187199263, + "grad_norm": 0.3312993347644806, + "learning_rate": 8.481329990763954e-05, + "loss": 0.0299, + "step": 4370 + }, + { + "epoch": 1.1537143233801332, + "grad_norm": 0.023838184773921967, + "learning_rate": 8.468135637946959e-05, + "loss": 0.0226, + "step": 4375 + }, + { + "epoch": 1.15503262804034, + "grad_norm": 0.42516952753067017, + "learning_rate": 8.454941285129965e-05, + "loss": 0.0088, + "step": 4380 + }, + { + "epoch": 1.156350932700547, + "grad_norm": 0.6900278925895691, + "learning_rate": 8.44174693231297e-05, + "loss": 0.0245, + "step": 4385 + }, + { + "epoch": 1.157669237360754, + "grad_norm": 0.2932703197002411, + "learning_rate": 8.428552579495976e-05, + "loss": 0.0207, + "step": 4390 + }, + { + "epoch": 1.158987542020961, + "grad_norm": 0.12942780554294586, + "learning_rate": 8.415358226678982e-05, + "loss": 0.0037, + "step": 4395 + }, + { + "epoch": 1.160305846681168, + "grad_norm": 0.9499046802520752, + "learning_rate": 8.402163873861989e-05, + "loss": 0.0246, + "step": 4400 + }, + { + "epoch": 1.161624151341375, + "grad_norm": 0.008869118988513947, + "learning_rate": 8.388969521044994e-05, + "loss": 0.0171, + "step": 4405 + }, + { + "epoch": 1.162942456001582, + "grad_norm": 1.7409231662750244, + "learning_rate": 8.375775168227998e-05, + "loss": 0.017, + "step": 4410 + }, + { + "epoch": 1.164260760661789, + "grad_norm": 0.0020101398695260286, + "learning_rate": 8.362580815411004e-05, + "loss": 0.0027, + "step": 4415 + }, + { + "epoch": 1.165579065321996, + "grad_norm": 0.0785067081451416, + "learning_rate": 8.34938646259401e-05, + "loss": 0.0043, + "step": 4420 + }, + { + "epoch": 1.166897369982203, + "grad_norm": 0.0029506285209208727, + "learning_rate": 8.336192109777016e-05, + "loss": 0.0109, + "step": 4425 + }, + { + "epoch": 1.16821567464241, + "grad_norm": 0.02216683328151703, + "learning_rate": 8.322997756960022e-05, + "loss": 0.0026, + "step": 4430 + }, + { + "epoch": 1.1695339793026167, + "grad_norm": 0.02216639369726181, + "learning_rate": 8.309803404143027e-05, + "loss": 0.0045, + "step": 4435 + }, + { + "epoch": 1.170852283962824, + "grad_norm": 0.0, + "learning_rate": 8.296609051326033e-05, + "loss": 0.006, + "step": 4440 + }, + { + "epoch": 1.1721705886230307, + "grad_norm": 0.0019736960530281067, + "learning_rate": 8.283414698509039e-05, + "loss": 0.0078, + "step": 4445 + }, + { + "epoch": 1.1734888932832377, + "grad_norm": 0.012957746163010597, + "learning_rate": 8.270220345692044e-05, + "loss": 0.002, + "step": 4450 + }, + { + "epoch": 1.1748071979434447, + "grad_norm": 0.010877869091928005, + "learning_rate": 8.25702599287505e-05, + "loss": 0.0237, + "step": 4455 + }, + { + "epoch": 1.1761255026036517, + "grad_norm": 0.005947659723460674, + "learning_rate": 8.243831640058055e-05, + "loss": 0.0341, + "step": 4460 + }, + { + "epoch": 1.1774438072638587, + "grad_norm": 0.0005026470171287656, + "learning_rate": 8.230637287241061e-05, + "loss": 0.0033, + "step": 4465 + }, + { + "epoch": 1.1787621119240657, + "grad_norm": 0.022054588422179222, + "learning_rate": 8.217442934424066e-05, + "loss": 0.0042, + "step": 4470 + }, + { + "epoch": 1.1800804165842727, + 
"grad_norm": 0.7929030656814575, + "learning_rate": 8.204248581607072e-05, + "loss": 0.0076, + "step": 4475 + }, + { + "epoch": 1.1813987212444796, + "grad_norm": 0.39052629470825195, + "learning_rate": 8.191054228790078e-05, + "loss": 0.0228, + "step": 4480 + }, + { + "epoch": 1.1827170259046866, + "grad_norm": 0.007177622988820076, + "learning_rate": 8.177859875973084e-05, + "loss": 0.01, + "step": 4485 + }, + { + "epoch": 1.1840353305648936, + "grad_norm": 0.006175135262310505, + "learning_rate": 8.16466552315609e-05, + "loss": 0.0037, + "step": 4490 + }, + { + "epoch": 1.1853536352251006, + "grad_norm": 0.0356481671333313, + "learning_rate": 8.151471170339096e-05, + "loss": 0.0024, + "step": 4495 + }, + { + "epoch": 1.1866719398853074, + "grad_norm": 0.19069480895996094, + "learning_rate": 8.138276817522101e-05, + "loss": 0.0048, + "step": 4500 + }, + { + "epoch": 1.1866719398853074, + "eval_loss": 0.026386437937617302, + "eval_runtime": 452.2896, + "eval_samples_per_second": 7.455, + "eval_steps_per_second": 3.728, + "step": 4500 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.632132597925089e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4500/training_args.bin b/checkpoint-4500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-4500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-500/README.md b/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-500/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-500/adapter_config.json b/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..083327e0c9f3dd1a9eda2bc8b558f75005b93594 --- /dev/null +++ b/checkpoint-500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "q_proj", + "v_proj", + "k_proj", + "down_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-500/adapter_model.safetensors b/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b9c302a9481b7d499e77413ca7d9923c0023fcb8 --- /dev/null +++ b/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:004b48f3a7340b91f88f4afc7b45791f4b444dfa715ebbb9251a9fa0ae441800 +size 48680136 diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..10d057fbc5a7d03b22866c9f72f69fec86f85c4f --- 
/dev/null +++ b/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e166175066e2c1847bee1dde7c2277512d77ef8a2fd456ab0eabb03ed7ea8e8 +size 49846644 diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9757e1cbf2866774512bec82f94e17e9fb03cd6 --- /dev/null +++ b/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d7ecf15e83ac4d18e0d90f8a44821af2f304313a6ae05eeb21767226a79c463 +size 14244 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d75821f03abb792756efc670fc16c1559912243b --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924c9372f0d5c6820159961ab7e3d3d48df0fcbba9b71ecb6e4d0603a9407541 +size 1064 diff --git a/checkpoint-500/special_tokens_map.json b/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-500/tokenizer.json b/checkpoint-500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b505f7771010e0b872d94ddb23916f0e9ebe5d6f --- /dev/null +++ b/checkpoint-500/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": 
"<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": 
"<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": 
"<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": 
"<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": 
"<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": 
"<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": 
"<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": 
"<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizerFast", + "unk_token": null +} diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7865a3f25482b304eb50b72c7c7e660549896ba2 --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,742 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1318304660206974, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + 
"step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 
0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + "learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + 
"learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 
0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, + "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.034358497687962e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/training_args.bin 
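The `checkpoint-500/trainer_state.json` above logs the training loss every 5 steps and a single evaluation at step 500 (eval_loss ≈ 0.045), on a schedule running to 7584 steps over 2 epochs. As a minimal sketch for inspecting that log offline — assuming the repository has been cloned so the path below resolves; only the standard library is used:

```python
import json

# Path as it appears in this diff; assumes a local clone of the repo.
with open("checkpoint-500/trainer_state.json") as f:
    state = json.load(f)

train_logs = [e for e in state["log_history"] if "loss" in e]      # per-step training entries
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]  # evaluation entries

print(f"logged training points: {len(train_logs)}, last train loss: {train_logs[-1]['loss']}")
for e in eval_logs:
    print(f"step {e['step']}: eval_loss = {e['eval_loss']}")
```

Swapping the path should give the corresponding view for the later checkpoints, which save their trainer state in the same format.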
b/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d172b090b85619dc1adf130a66b333f6491d2342 --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cee8b68ab8cbec1968c4eec747022ea57cd350072097ab37179dd29309655d7 +size 5688 diff --git a/checkpoint-5000/README.md b/checkpoint-5000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-5000/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
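The "How to Get Started with the Model" section of this card is still a placeholder, so here is a minimal sketch of loading a checkpoint from this repo as a LoRA adapter on top of the base model named in `adapter_config.json` (`unsloth/Llama-3.2-3B-Instruct`). It assumes `torch`, `transformers`, and `peft` are installed and that the `checkpoint-5000/` directory is available locally; the prompt, dtype, and generation settings are illustrative only, not taken from the repo.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "unsloth/Llama-3.2-3B-Instruct"  # base_model_name_or_path from adapter_config.json
adapter_dir = "checkpoint-5000"            # any checkpoint directory from this repo
device = "cuda" if torch.cuda.is_available() else "cpu"

# The tokenizer (pad token and Llama-3 chat template included) is saved with each checkpoint.
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base, adapter_dir).to(device).eval()

messages = [{"role": "user", "content": "Hello!"}]  # hypothetical prompt
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(device)

with torch.no_grad():
    output = model.generate(
        input_ids=input_ids, max_new_tokens=64, pad_token_id=tokenizer.pad_token_id
    )
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

Loading the tokenizer from the checkpoint rather than from the base repo keeps the `<|finetune_right_pad_id|>` pad token and the shipped chat template consistent with what the training run used.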
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-5000/adapter_config.json b/checkpoint-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-5000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5000/adapter_model.safetensors b/checkpoint-5000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3aea347e198023dbe9af6d8c05be24a01d6fcfa8 --- /dev/null +++ b/checkpoint-5000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8801668404d375a8d412542e2d120cafc0c8bad8a054dfe64f44ded0579c6654 +size 97307544 diff --git a/checkpoint-5000/optimizer.pt b/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..82a01ebde87b81dc01cb824fa08c425b58acc6fe --- /dev/null +++ b/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7be5cc7992dfe3eb226ea59b594f5e7e564e808e5cb5edaf8ce8ffbab1bbecb6 +size 50866370 diff --git a/checkpoint-5000/rng_state.pth b/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f0a4cb7f4460e86c8164621bd5f70684a8b0d47 --- /dev/null +++ b/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ff510f6f9cc84844f5ad0696d969584791b77ac1ac56b5ca94efb4aa324eeff +size 14244 diff --git a/checkpoint-5000/scheduler.pt b/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ed0d65e1b2f9bcfa58f20f56cc7a0e342edcb19 --- /dev/null +++ b/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:70d7b236eb06334ad224f214fed61e6762ae675bbe0b2796aaf765f41f333fa3 +size 1064 diff --git a/checkpoint-5000/special_tokens_map.json b/checkpoint-5000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-5000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-5000/tokenizer.json b/checkpoint-5000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-5000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-5000/tokenizer_config.json b/checkpoint-5000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-5000/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": 
"<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": 
"<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": 
"<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": 
"<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": 
"<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": 
"<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": 
"<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": 
"<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": 
"<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-5000/trainer_state.json b/checkpoint-5000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e7c5e874487e06df0a8a431badc6f795279f1375 --- /dev/null +++ b/checkpoint-5000/trainer_state.json @@ -0,0 +1,7116 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 
1.3185024059060049, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + "step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 
0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + 
"learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + "learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + 
"learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, 
+ "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + "learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + 
"learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + 
"grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + 
"step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 
0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + "loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + 
"learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + 
"epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 
0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + 
"epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 
0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + 
"loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + }, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 
0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + 
"learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 
0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + "loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 
0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + "grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + 
"learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + 
"step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, + { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 
0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.03460961952805519, + "learning_rate": 0.00013402823591502838, + "loss": 0.0097, + "step": 2505 + }, + { + "epoch": 0.6617889394239008, + "grad_norm": 0.31785696744918823, + "learning_rate": 0.00013389629238685842, + "loss": 0.0303, + "step": 2510 + }, + { + "epoch": 0.6631072440841078, + "grad_norm": 0.4273851215839386, + "learning_rate": 0.0001337643488586885, + "loss": 0.0499, + "step": 2515 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.02236153744161129, + "learning_rate": 0.00013363240533051856, + "loss": 0.0069, + "step": 2520 + }, + { + "epoch": 0.6657438534045218, + "grad_norm": 0.1592864990234375, + "learning_rate": 0.0001335004618023486, + "loss": 0.0326, + "step": 2525 + }, + { + "epoch": 0.6670621580647288, + "grad_norm": 0.029961545020341873, + "learning_rate": 0.00013336851827417867, + "loss": 0.0178, + "step": 2530 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.03120764158666134, + "learning_rate": 0.00013323657474600872, + "loss": 0.115, + "step": 2535 + }, + { + "epoch": 0.6696987673851427, + "grad_norm": 0.01060028001666069, + "learning_rate": 0.00013310463121783879, + "loss": 0.0036, + "step": 2540 + }, + { + "epoch": 0.6710170720453497, + "grad_norm": 0.053470809012651443, + "learning_rate": 0.00013297268768966883, + "loss": 0.0079, + "step": 2545 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.022777097299695015, + "learning_rate": 0.00013284074416149887, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6736536813657636, + "grad_norm": 0.0548521913588047, + "learning_rate": 0.00013270880063332894, + "loss": 0.0503, + "step": 2555 + }, + { + "epoch": 0.6749719860259706, + "grad_norm": 0.02028457075357437, + "learning_rate": 0.00013257685710515898, + "loss": 0.0096, + "step": 2560 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.01569107361137867, + "learning_rate": 0.00013244491357698905, + "loss": 0.008, + "step": 2565 + }, + { + "epoch": 0.6776085953463845, + "grad_norm": 0.00743742985650897, + "learning_rate": 0.00013231297004881912, + "loss": 0.005, + "step": 2570 + }, + { + "epoch": 0.6789269000065915, + "grad_norm": 0.025164416059851646, + "learning_rate": 0.00013218102652064916, + "loss": 0.018, + "step": 2575 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.3653188645839691, + "learning_rate": 0.00013204908299247923, + "loss": 0.0295, + "step": 2580 + }, + { + "epoch": 0.6815635093270055, + "grad_norm": 0.685422956943512, + "learning_rate": 0.00013191713946430927, + "loss": 0.0335, + "step": 2585 + }, + { + "epoch": 0.6828818139872125, + "grad_norm": 0.675740122795105, + "learning_rate": 0.00013178519593613934, + "loss": 0.0592, + "step": 2590 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.10513252764940262, + "learning_rate": 0.00013165325240796938, + "loss": 0.0353, + "step": 2595 + }, + { + "epoch": 0.6855184233076264, + "grad_norm": 0.43512973189353943, + "learning_rate": 0.00013152130887979945, + "loss": 0.0142, + "step": 2600 + }, + { + "epoch": 0.6868367279678333, + "grad_norm": 0.029436839744448662, + "learning_rate": 0.00013138936535162952, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.5607122778892517, + "learning_rate": 0.00013125742182345957, + "loss": 0.0184, + "step": 2610 + }, + { + "epoch": 0.6894733372882473, + "grad_norm": 0.11365406215190887, + 
"learning_rate": 0.00013112547829528963, + "loss": 0.006, + "step": 2615 + }, + { + "epoch": 0.6907916419484543, + "grad_norm": 0.047227244824171066, + "learning_rate": 0.00013099353476711968, + "loss": 0.008, + "step": 2620 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.0005877618095837533, + "learning_rate": 0.00013086159123894975, + "loss": 0.0286, + "step": 2625 + }, + { + "epoch": 0.6934282512688682, + "grad_norm": 0.010759112425148487, + "learning_rate": 0.0001307296477107798, + "loss": 0.0062, + "step": 2630 + }, + { + "epoch": 0.6947465559290752, + "grad_norm": 0.07117745280265808, + "learning_rate": 0.00013059770418260986, + "loss": 0.0891, + "step": 2635 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.0639057606458664, + "learning_rate": 0.00013046576065443993, + "loss": 0.0072, + "step": 2640 + }, + { + "epoch": 0.6973831652494892, + "grad_norm": 0.027350090444087982, + "learning_rate": 0.00013033381712626994, + "loss": 0.0103, + "step": 2645 + }, + { + "epoch": 0.6987014699096962, + "grad_norm": 0.015336195938289165, + "learning_rate": 0.0001302018735981, + "loss": 0.0041, + "step": 2650 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 1.0650830268859863, + "learning_rate": 0.00013006993006993008, + "loss": 0.0443, + "step": 2655 + }, + { + "epoch": 0.70133807923011, + "grad_norm": 0.019073212519288063, + "learning_rate": 0.00012993798654176012, + "loss": 0.0331, + "step": 2660 + }, + { + "epoch": 0.702656383890317, + "grad_norm": 0.10109209269285202, + "learning_rate": 0.0001298060430135902, + "loss": 0.0054, + "step": 2665 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.03528957813978195, + "learning_rate": 0.00012967409948542023, + "loss": 0.0427, + "step": 2670 + }, + { + "epoch": 0.705292993210731, + "grad_norm": 0.03577788919210434, + "learning_rate": 0.0001295421559572503, + "loss": 0.023, + "step": 2675 + }, + { + "epoch": 0.706611297870938, + "grad_norm": 0.5576180815696716, + "learning_rate": 0.00012941021242908034, + "loss": 0.0416, + "step": 2680 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.017131298780441284, + "learning_rate": 0.0001292782689009104, + "loss": 0.0235, + "step": 2685 + }, + { + "epoch": 0.7092479071913519, + "grad_norm": 0.8517888784408569, + "learning_rate": 0.00012914632537274048, + "loss": 0.0168, + "step": 2690 + }, + { + "epoch": 0.7105662118515589, + "grad_norm": 0.23812156915664673, + "learning_rate": 0.00012901438184457052, + "loss": 0.0483, + "step": 2695 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.11746613681316376, + "learning_rate": 0.0001288824383164006, + "loss": 0.0255, + "step": 2700 + }, + { + "epoch": 0.7132028211719729, + "grad_norm": 0.20089928805828094, + "learning_rate": 0.00012875049478823064, + "loss": 0.0267, + "step": 2705 + }, + { + "epoch": 0.7145211258321799, + "grad_norm": 0.8301129937171936, + "learning_rate": 0.0001286185512600607, + "loss": 0.016, + "step": 2710 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.01838674768805504, + "learning_rate": 0.00012848660773189077, + "loss": 0.0229, + "step": 2715 + }, + { + "epoch": 0.7171577351525937, + "grad_norm": 0.03670337051153183, + "learning_rate": 0.00012835466420372082, + "loss": 0.038, + "step": 2720 + }, + { + "epoch": 0.7184760398128007, + "grad_norm": 0.0452633760869503, + "learning_rate": 0.00012822272067555089, + "loss": 0.0622, + "step": 2725 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.09503110498189926, + "learning_rate": 0.00012809077714738093, + "loss": 0.0209, + "step": 2730 + }, 
+ { + "epoch": 0.7211126491332147, + "grad_norm": 1.0327308177947998, + "learning_rate": 0.000127958833619211, + "loss": 0.0361, + "step": 2735 + }, + { + "epoch": 0.7224309537934217, + "grad_norm": 1.0049290657043457, + "learning_rate": 0.00012782689009104104, + "loss": 0.0365, + "step": 2740 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.029774073511362076, + "learning_rate": 0.00012769494656287108, + "loss": 0.0257, + "step": 2745 + }, + { + "epoch": 0.7250675631138356, + "grad_norm": 0.20974040031433105, + "learning_rate": 0.00012756300303470115, + "loss": 0.0542, + "step": 2750 + }, + { + "epoch": 0.7263858677740426, + "grad_norm": 0.8153854608535767, + "learning_rate": 0.0001274310595065312, + "loss": 0.0216, + "step": 2755 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.4393698573112488, + "learning_rate": 0.00012729911597836126, + "loss": 0.0451, + "step": 2760 + }, + { + "epoch": 0.7290224770944566, + "grad_norm": 0.06990349292755127, + "learning_rate": 0.00012716717245019133, + "loss": 0.03, + "step": 2765 + }, + { + "epoch": 0.7303407817546635, + "grad_norm": 0.32689470052719116, + "learning_rate": 0.00012703522892202137, + "loss": 0.0263, + "step": 2770 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.026600876823067665, + "learning_rate": 0.00012690328539385144, + "loss": 0.0404, + "step": 2775 + }, + { + "epoch": 0.7329773910750774, + "grad_norm": 0.11228257417678833, + "learning_rate": 0.00012677134186568148, + "loss": 0.0224, + "step": 2780 + }, + { + "epoch": 0.7342956957352844, + "grad_norm": 0.6469443440437317, + "learning_rate": 0.00012663939833751155, + "loss": 0.0178, + "step": 2785 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.020773250609636307, + "learning_rate": 0.0001265074548093416, + "loss": 0.011, + "step": 2790 + }, + { + "epoch": 0.7369323050556984, + "grad_norm": 0.7378728985786438, + "learning_rate": 0.00012637551128117167, + "loss": 0.0227, + "step": 2795 + }, + { + "epoch": 0.7382506097159054, + "grad_norm": 0.008189595304429531, + "learning_rate": 0.00012624356775300173, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.031633853912353516, + "learning_rate": 0.00012611162422483178, + "loss": 0.0093, + "step": 2805 + }, + { + "epoch": 0.7408872190363193, + "grad_norm": 0.5078475475311279, + "learning_rate": 0.00012597968069666185, + "loss": 0.0567, + "step": 2810 + }, + { + "epoch": 0.7422055236965263, + "grad_norm": 0.21766887605190277, + "learning_rate": 0.0001258477371684919, + "loss": 0.0485, + "step": 2815 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.3029612898826599, + "learning_rate": 0.00012571579364032196, + "loss": 0.032, + "step": 2820 + }, + { + "epoch": 0.7448421330169402, + "grad_norm": 1.2135159969329834, + "learning_rate": 0.00012558385011215203, + "loss": 0.0139, + "step": 2825 + }, + { + "epoch": 0.7461604376771472, + "grad_norm": 0.016875172033905983, + "learning_rate": 0.00012545190658398207, + "loss": 0.0323, + "step": 2830 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.08923230320215225, + "learning_rate": 0.00012531996305581214, + "loss": 0.0343, + "step": 2835 + }, + { + "epoch": 0.7487970469975611, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.00012518801952764215, + "loss": 0.0431, + "step": 2840 + }, + { + "epoch": 0.7501153516577681, + "grad_norm": 0.7344386577606201, + "learning_rate": 0.00012505607599947222, + "loss": 0.0389, + "step": 2845 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.03681635856628418, + 
"learning_rate": 0.0001249241324713023, + "loss": 0.0258, + "step": 2850 + }, + { + "epoch": 0.7527519609781821, + "grad_norm": 0.22866861522197723, + "learning_rate": 0.00012479218894313233, + "loss": 0.0223, + "step": 2855 + }, + { + "epoch": 0.7540702656383891, + "grad_norm": 0.029770435765385628, + "learning_rate": 0.0001246602454149624, + "loss": 0.0205, + "step": 2860 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.011845707893371582, + "learning_rate": 0.00012452830188679244, + "loss": 0.0252, + "step": 2865 + }, + { + "epoch": 0.756706874958803, + "grad_norm": 0.06696149706840515, + "learning_rate": 0.00012439635835862251, + "loss": 0.0166, + "step": 2870 + }, + { + "epoch": 0.75802517961901, + "grad_norm": 0.01653144136071205, + "learning_rate": 0.00012426441483045256, + "loss": 0.0487, + "step": 2875 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.031312476843595505, + "learning_rate": 0.00012413247130228263, + "loss": 0.0155, + "step": 2880 + }, + { + "epoch": 0.7606617889394239, + "grad_norm": 0.011625733226537704, + "learning_rate": 0.0001240005277741127, + "loss": 0.0333, + "step": 2885 + }, + { + "epoch": 0.7619800935996309, + "grad_norm": 0.012089414522051811, + "learning_rate": 0.00012386858424594274, + "loss": 0.003, + "step": 2890 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.3012307584285736, + "learning_rate": 0.0001237366407177728, + "loss": 0.0172, + "step": 2895 + }, + { + "epoch": 0.7646167029200448, + "grad_norm": 0.31575000286102295, + "learning_rate": 0.00012360469718960285, + "loss": 0.0409, + "step": 2900 + }, + { + "epoch": 0.7659350075802518, + "grad_norm": 0.009794364683330059, + "learning_rate": 0.00012347275366143292, + "loss": 0.0214, + "step": 2905 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5973085165023804, + "learning_rate": 0.00012334081013326299, + "loss": 0.0245, + "step": 2910 + }, + { + "epoch": 0.7685716169006658, + "grad_norm": 0.019750040024518967, + "learning_rate": 0.00012320886660509303, + "loss": 0.0063, + "step": 2915 + }, + { + "epoch": 0.7698899215608728, + "grad_norm": 0.06402858346700668, + "learning_rate": 0.0001230769230769231, + "loss": 0.0444, + "step": 2920 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.02876671403646469, + "learning_rate": 0.00012294497954875314, + "loss": 0.0103, + "step": 2925 + }, + { + "epoch": 0.7725265308812866, + "grad_norm": 0.6962207555770874, + "learning_rate": 0.0001228130360205832, + "loss": 0.0318, + "step": 2930 + }, + { + "epoch": 0.7738448355414936, + "grad_norm": 0.006536522414535284, + "learning_rate": 0.00012268109249241325, + "loss": 0.0096, + "step": 2935 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.07097168266773224, + "learning_rate": 0.0001225491489642433, + "loss": 0.0174, + "step": 2940 + }, + { + "epoch": 0.7764814448619076, + "grad_norm": 0.042360126972198486, + "learning_rate": 0.00012241720543607336, + "loss": 0.0158, + "step": 2945 + }, + { + "epoch": 0.7777997495221146, + "grad_norm": 0.01159572321921587, + "learning_rate": 0.0001222852619079034, + "loss": 0.0265, + "step": 2950 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.38408163189888, + "learning_rate": 0.00012215331837973347, + "loss": 0.0233, + "step": 2955 + }, + { + "epoch": 0.7804363588425285, + "grad_norm": 0.15588605403900146, + "learning_rate": 0.00012202137485156353, + "loss": 0.0041, + "step": 2960 + }, + { + "epoch": 0.7817546635027355, + "grad_norm": 0.006892362609505653, + "learning_rate": 0.00012188943132339358, + "loss": 0.0026, + 
"step": 2965 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.030915727838873863, + "learning_rate": 0.00012175748779522364, + "loss": 0.0028, + "step": 2970 + }, + { + "epoch": 0.7843912728231495, + "grad_norm": 0.8151025772094727, + "learning_rate": 0.00012162554426705371, + "loss": 0.0429, + "step": 2975 + }, + { + "epoch": 0.7857095774833565, + "grad_norm": 0.6765475273132324, + "learning_rate": 0.00012149360073888377, + "loss": 0.0319, + "step": 2980 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.054469238966703415, + "learning_rate": 0.00012136165721071382, + "loss": 0.0413, + "step": 2985 + }, + { + "epoch": 0.7883461868037703, + "grad_norm": 0.045610666275024414, + "learning_rate": 0.00012122971368254388, + "loss": 0.0521, + "step": 2990 + }, + { + "epoch": 0.7896644914639773, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.00012109777015437393, + "loss": 0.0846, + "step": 2995 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.0272397268563509, + "learning_rate": 0.00012096582662620399, + "loss": 0.0364, + "step": 3000 + }, + { + "epoch": 0.7909827961241843, + "eval_loss": 0.033312585204839706, + "eval_runtime": 452.2552, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 3000 + }, + { + "epoch": 0.7923011007843913, + "grad_norm": 0.08674059063196182, + "learning_rate": 0.00012083388309803406, + "loss": 0.0081, + "step": 3005 + }, + { + "epoch": 0.7936194054445982, + "grad_norm": 0.21960832178592682, + "learning_rate": 0.00012070193956986411, + "loss": 0.0468, + "step": 3010 + }, + { + "epoch": 0.7949377101048052, + "grad_norm": 0.11259289085865021, + "learning_rate": 0.00012056999604169417, + "loss": 0.0124, + "step": 3015 + }, + { + "epoch": 0.7962560147650122, + "grad_norm": 0.02945362776517868, + "learning_rate": 0.00012043805251352422, + "loss": 0.0298, + "step": 3020 + }, + { + "epoch": 0.7975743194252192, + "grad_norm": 0.27889615297317505, + "learning_rate": 0.00012030610898535428, + "loss": 0.0251, + "step": 3025 + }, + { + "epoch": 0.7988926240854262, + "grad_norm": 0.05873241275548935, + "learning_rate": 0.00012017416545718434, + "loss": 0.0132, + "step": 3030 + }, + { + "epoch": 0.8002109287456332, + "grad_norm": 0.1570046991109848, + "learning_rate": 0.00012004222192901439, + "loss": 0.0228, + "step": 3035 + }, + { + "epoch": 0.80152923340584, + "grad_norm": 0.12575332820415497, + "learning_rate": 0.00011991027840084443, + "loss": 0.0049, + "step": 3040 + }, + { + "epoch": 0.802847538066047, + "grad_norm": 0.8416435122489929, + "learning_rate": 0.00011977833487267449, + "loss": 0.0542, + "step": 3045 + }, + { + "epoch": 0.804165842726254, + "grad_norm": 0.2605098485946655, + "learning_rate": 0.00011964639134450454, + "loss": 0.0084, + "step": 3050 + }, + { + "epoch": 0.805484147386461, + "grad_norm": 0.8996294736862183, + "learning_rate": 0.00011951444781633461, + "loss": 0.0442, + "step": 3055 + }, + { + "epoch": 0.806802452046668, + "grad_norm": 2.7525105476379395, + "learning_rate": 0.00011938250428816467, + "loss": 0.0642, + "step": 3060 + }, + { + "epoch": 0.808120756706875, + "grad_norm": 0.14955930411815643, + "learning_rate": 0.00011925056075999473, + "loss": 0.0384, + "step": 3065 + }, + { + "epoch": 0.8094390613670819, + "grad_norm": 0.018756115809082985, + "learning_rate": 0.00011911861723182478, + "loss": 0.0154, + "step": 3070 + }, + { + "epoch": 0.8107573660272889, + "grad_norm": 0.23998615145683289, + "learning_rate": 0.00011898667370365484, + "loss": 0.0413, + "step": 3075 + }, + { + "epoch": 
0.8120756706874959, + "grad_norm": 0.27253249287605286, + "learning_rate": 0.00011885473017548489, + "loss": 0.0081, + "step": 3080 + }, + { + "epoch": 0.8133939753477029, + "grad_norm": 0.2925993502140045, + "learning_rate": 0.00011872278664731495, + "loss": 0.0332, + "step": 3085 + }, + { + "epoch": 0.8147122800079099, + "grad_norm": 0.5364832878112793, + "learning_rate": 0.00011859084311914502, + "loss": 0.0143, + "step": 3090 + }, + { + "epoch": 0.8160305846681168, + "grad_norm": 0.32104921340942383, + "learning_rate": 0.00011845889959097507, + "loss": 0.0216, + "step": 3095 + }, + { + "epoch": 0.8173488893283237, + "grad_norm": 0.0205856766551733, + "learning_rate": 0.00011832695606280513, + "loss": 0.0346, + "step": 3100 + }, + { + "epoch": 0.8186671939885307, + "grad_norm": 0.2541547417640686, + "learning_rate": 0.00011819501253463518, + "loss": 0.0793, + "step": 3105 + }, + { + "epoch": 0.8199854986487377, + "grad_norm": 0.08333491533994675, + "learning_rate": 0.00011806306900646524, + "loss": 0.0049, + "step": 3110 + }, + { + "epoch": 0.8213038033089447, + "grad_norm": 0.0355968177318573, + "learning_rate": 0.0001179311254782953, + "loss": 0.0051, + "step": 3115 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.06948401033878326, + "learning_rate": 0.00011779918195012536, + "loss": 0.013, + "step": 3120 + }, + { + "epoch": 0.8239404126293587, + "grad_norm": 0.03328891843557358, + "learning_rate": 0.00011766723842195542, + "loss": 0.0122, + "step": 3125 + }, + { + "epoch": 0.8252587172895656, + "grad_norm": 0.013782350346446037, + "learning_rate": 0.00011753529489378548, + "loss": 0.0073, + "step": 3130 + }, + { + "epoch": 0.8265770219497726, + "grad_norm": 0.024390392005443573, + "learning_rate": 0.00011740335136561553, + "loss": 0.0143, + "step": 3135 + }, + { + "epoch": 0.8278953266099796, + "grad_norm": 0.002548128366470337, + "learning_rate": 0.00011727140783744557, + "loss": 0.0027, + "step": 3140 + }, + { + "epoch": 0.8292136312701865, + "grad_norm": 0.11674848943948746, + "learning_rate": 0.00011713946430927563, + "loss": 0.0253, + "step": 3145 + }, + { + "epoch": 0.8305319359303935, + "grad_norm": 0.005774884019047022, + "learning_rate": 0.00011700752078110568, + "loss": 0.0018, + "step": 3150 + }, + { + "epoch": 0.8318502405906005, + "grad_norm": 0.5763069987297058, + "learning_rate": 0.00011687557725293574, + "loss": 0.0119, + "step": 3155 + }, + { + "epoch": 0.8331685452508074, + "grad_norm": 0.0027607593219727278, + "learning_rate": 0.0001167436337247658, + "loss": 0.0279, + "step": 3160 + }, + { + "epoch": 0.8344868499110144, + "grad_norm": 1.859642505645752, + "learning_rate": 0.00011661169019659585, + "loss": 0.0228, + "step": 3165 + }, + { + "epoch": 0.8358051545712214, + "grad_norm": 0.16597022116184235, + "learning_rate": 0.00011647974666842592, + "loss": 0.1228, + "step": 3170 + }, + { + "epoch": 0.8371234592314284, + "grad_norm": 0.33833742141723633, + "learning_rate": 0.00011634780314025598, + "loss": 0.073, + "step": 3175 + }, + { + "epoch": 0.8384417638916354, + "grad_norm": 0.024682912975549698, + "learning_rate": 0.00011621585961208603, + "loss": 0.0042, + "step": 3180 + }, + { + "epoch": 0.8397600685518424, + "grad_norm": 0.05926942452788353, + "learning_rate": 0.00011608391608391609, + "loss": 0.0066, + "step": 3185 + }, + { + "epoch": 0.8410783732120493, + "grad_norm": 0.1414029747247696, + "learning_rate": 0.00011595197255574614, + "loss": 0.0603, + "step": 3190 + }, + { + "epoch": 0.8423966778722562, + "grad_norm": 0.37928736209869385, + 
"learning_rate": 0.0001158200290275762, + "loss": 0.0266, + "step": 3195 + }, + { + "epoch": 0.8437149825324632, + "grad_norm": 0.018329354003071785, + "learning_rate": 0.00011568808549940627, + "loss": 0.0047, + "step": 3200 + }, + { + "epoch": 0.8450332871926702, + "grad_norm": 0.2993735373020172, + "learning_rate": 0.00011555614197123632, + "loss": 0.0218, + "step": 3205 + }, + { + "epoch": 0.8463515918528772, + "grad_norm": 0.1767728328704834, + "learning_rate": 0.00011542419844306638, + "loss": 0.0363, + "step": 3210 + }, + { + "epoch": 0.8476698965130842, + "grad_norm": 0.39774414896965027, + "learning_rate": 0.00011529225491489644, + "loss": 0.0506, + "step": 3215 + }, + { + "epoch": 0.8489882011732911, + "grad_norm": 0.021896762773394585, + "learning_rate": 0.00011516031138672649, + "loss": 0.0081, + "step": 3220 + }, + { + "epoch": 0.8503065058334981, + "grad_norm": 0.358372300863266, + "learning_rate": 0.00011502836785855655, + "loss": 0.0224, + "step": 3225 + }, + { + "epoch": 0.8516248104937051, + "grad_norm": 0.01605542004108429, + "learning_rate": 0.00011489642433038662, + "loss": 0.0215, + "step": 3230 + }, + { + "epoch": 0.8529431151539121, + "grad_norm": 0.021189266815781593, + "learning_rate": 0.00011476448080221667, + "loss": 0.0051, + "step": 3235 + }, + { + "epoch": 0.8542614198141191, + "grad_norm": 0.013394076377153397, + "learning_rate": 0.0001146325372740467, + "loss": 0.021, + "step": 3240 + }, + { + "epoch": 0.8555797244743261, + "grad_norm": 0.19848507642745972, + "learning_rate": 0.00011450059374587676, + "loss": 0.0285, + "step": 3245 + }, + { + "epoch": 0.856898029134533, + "grad_norm": 0.2463046759366989, + "learning_rate": 0.00011436865021770683, + "loss": 0.0384, + "step": 3250 + }, + { + "epoch": 0.8582163337947399, + "grad_norm": 0.37432390451431274, + "learning_rate": 0.00011423670668953688, + "loss": 0.0098, + "step": 3255 + }, + { + "epoch": 0.8595346384549469, + "grad_norm": 0.060943394899368286, + "learning_rate": 0.00011410476316136694, + "loss": 0.0087, + "step": 3260 + }, + { + "epoch": 0.8608529431151539, + "grad_norm": 0.2846696674823761, + "learning_rate": 0.00011397281963319699, + "loss": 0.0148, + "step": 3265 + }, + { + "epoch": 0.8621712477753609, + "grad_norm": 0.009311323054134846, + "learning_rate": 0.00011384087610502705, + "loss": 0.0024, + "step": 3270 + }, + { + "epoch": 0.8634895524355679, + "grad_norm": 0.046277035027742386, + "learning_rate": 0.0001137089325768571, + "loss": 0.0274, + "step": 3275 + }, + { + "epoch": 0.8648078570957748, + "grad_norm": 0.006024620030075312, + "learning_rate": 0.00011357698904868716, + "loss": 0.0286, + "step": 3280 + }, + { + "epoch": 0.8661261617559818, + "grad_norm": 0.033578380942344666, + "learning_rate": 0.00011344504552051723, + "loss": 0.0153, + "step": 3285 + }, + { + "epoch": 0.8674444664161888, + "grad_norm": 0.8537917137145996, + "learning_rate": 0.00011331310199234728, + "loss": 0.0304, + "step": 3290 + }, + { + "epoch": 0.8687627710763958, + "grad_norm": 0.013933337293565273, + "learning_rate": 0.00011318115846417734, + "loss": 0.0112, + "step": 3295 + }, + { + "epoch": 0.8700810757366028, + "grad_norm": 0.35437721014022827, + "learning_rate": 0.0001130492149360074, + "loss": 0.0228, + "step": 3300 + }, + { + "epoch": 0.8713993803968098, + "grad_norm": 1.3024121522903442, + "learning_rate": 0.00011291727140783745, + "loss": 0.0203, + "step": 3305 + }, + { + "epoch": 0.8727176850570166, + "grad_norm": 0.5131255984306335, + "learning_rate": 0.00011278532787966751, + "loss": 0.0181, + 
"step": 3310 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.039366886019706726, + "learning_rate": 0.00011265338435149758, + "loss": 0.0192, + "step": 3315 + }, + { + "epoch": 0.8753542943774306, + "grad_norm": 0.13679669797420502, + "learning_rate": 0.00011252144082332763, + "loss": 0.004, + "step": 3320 + }, + { + "epoch": 0.8766725990376376, + "grad_norm": 0.003076886525377631, + "learning_rate": 0.00011238949729515769, + "loss": 0.0405, + "step": 3325 + }, + { + "epoch": 0.8779909036978446, + "grad_norm": 0.019953785464167595, + "learning_rate": 0.00011225755376698774, + "loss": 0.0241, + "step": 3330 + }, + { + "epoch": 0.8793092083580516, + "grad_norm": 0.007980377413332462, + "learning_rate": 0.0001121256102388178, + "loss": 0.0064, + "step": 3335 + }, + { + "epoch": 0.8806275130182585, + "grad_norm": 0.018761295825242996, + "learning_rate": 0.00011199366671064784, + "loss": 0.0032, + "step": 3340 + }, + { + "epoch": 0.8819458176784655, + "grad_norm": 0.022511709481477737, + "learning_rate": 0.0001118617231824779, + "loss": 0.0055, + "step": 3345 + }, + { + "epoch": 0.8832641223386725, + "grad_norm": 0.021270718425512314, + "learning_rate": 0.00011172977965430795, + "loss": 0.033, + "step": 3350 + }, + { + "epoch": 0.8845824269988795, + "grad_norm": 0.02710561640560627, + "learning_rate": 0.00011159783612613801, + "loss": 0.0094, + "step": 3355 + }, + { + "epoch": 0.8859007316590864, + "grad_norm": 0.4353378117084503, + "learning_rate": 0.00011146589259796806, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.8872190363192934, + "grad_norm": 0.0257766991853714, + "learning_rate": 0.00011133394906979813, + "loss": 0.0059, + "step": 3365 + }, + { + "epoch": 0.8885373409795003, + "grad_norm": 0.80838942527771, + "learning_rate": 0.00011120200554162819, + "loss": 0.0263, + "step": 3370 + }, + { + "epoch": 0.8898556456397073, + "grad_norm": 0.007799761835485697, + "learning_rate": 0.00011107006201345824, + "loss": 0.0028, + "step": 3375 + }, + { + "epoch": 0.8911739502999143, + "grad_norm": 0.007315775845199823, + "learning_rate": 0.0001109381184852883, + "loss": 0.0127, + "step": 3380 + }, + { + "epoch": 0.8924922549601213, + "grad_norm": 1.4861233234405518, + "learning_rate": 0.00011080617495711836, + "loss": 0.0562, + "step": 3385 + }, + { + "epoch": 0.8938105596203283, + "grad_norm": 0.010219530202448368, + "learning_rate": 0.00011067423142894841, + "loss": 0.0438, + "step": 3390 + }, + { + "epoch": 0.8951288642805353, + "grad_norm": 1.0191857814788818, + "learning_rate": 0.00011054228790077848, + "loss": 0.0493, + "step": 3395 + }, + { + "epoch": 0.8964471689407422, + "grad_norm": 0.01459536887705326, + "learning_rate": 0.00011041034437260854, + "loss": 0.0117, + "step": 3400 + }, + { + "epoch": 0.8977654736009492, + "grad_norm": 0.008682495914399624, + "learning_rate": 0.00011027840084443859, + "loss": 0.02, + "step": 3405 + }, + { + "epoch": 0.8990837782611562, + "grad_norm": 0.02197263017296791, + "learning_rate": 0.00011014645731626865, + "loss": 0.0454, + "step": 3410 + }, + { + "epoch": 0.9004020829213631, + "grad_norm": 0.01436714269220829, + "learning_rate": 0.0001100145137880987, + "loss": 0.0283, + "step": 3415 + }, + { + "epoch": 0.9017203875815701, + "grad_norm": 0.14327946305274963, + "learning_rate": 0.00010988257025992876, + "loss": 0.0461, + "step": 3420 + }, + { + "epoch": 0.9030386922417771, + "grad_norm": 1.671773910522461, + "learning_rate": 0.00010975062673175883, + "loss": 0.054, + "step": 3425 + }, + { + "epoch": 0.904356996901984, + 
"grad_norm": 0.009926804341375828, + "learning_rate": 0.00010961868320358888, + "loss": 0.0429, + "step": 3430 + }, + { + "epoch": 0.905675301562191, + "grad_norm": 0.554020881652832, + "learning_rate": 0.00010948673967541894, + "loss": 0.0618, + "step": 3435 + }, + { + "epoch": 0.906993606222398, + "grad_norm": 0.1399248093366623, + "learning_rate": 0.00010935479614724897, + "loss": 0.0229, + "step": 3440 + }, + { + "epoch": 0.908311910882605, + "grad_norm": 0.02739197015762329, + "learning_rate": 0.00010922285261907904, + "loss": 0.0082, + "step": 3445 + }, + { + "epoch": 0.909630215542812, + "grad_norm": 0.33394527435302734, + "learning_rate": 0.00010909090909090909, + "loss": 0.0403, + "step": 3450 + }, + { + "epoch": 0.9109485202030189, + "grad_norm": 0.08083894103765488, + "learning_rate": 0.00010895896556273915, + "loss": 0.0406, + "step": 3455 + }, + { + "epoch": 0.9122668248632259, + "grad_norm": 0.39336663484573364, + "learning_rate": 0.0001088270220345692, + "loss": 0.02, + "step": 3460 + }, + { + "epoch": 0.9135851295234328, + "grad_norm": 0.20481553673744202, + "learning_rate": 0.00010869507850639926, + "loss": 0.0221, + "step": 3465 + }, + { + "epoch": 0.9149034341836398, + "grad_norm": 1.4507408142089844, + "learning_rate": 0.00010856313497822932, + "loss": 0.0357, + "step": 3470 + }, + { + "epoch": 0.9162217388438468, + "grad_norm": 0.2678806483745575, + "learning_rate": 0.00010843119145005937, + "loss": 0.0181, + "step": 3475 + }, + { + "epoch": 0.9175400435040538, + "grad_norm": 0.007361674215644598, + "learning_rate": 0.00010829924792188944, + "loss": 0.0978, + "step": 3480 + }, + { + "epoch": 0.9188583481642607, + "grad_norm": 0.773695707321167, + "learning_rate": 0.0001081673043937195, + "loss": 0.0401, + "step": 3485 + }, + { + "epoch": 0.9201766528244677, + "grad_norm": 0.0010772625682875514, + "learning_rate": 0.00010803536086554955, + "loss": 0.0233, + "step": 3490 + }, + { + "epoch": 0.9214949574846747, + "grad_norm": 0.08971104770898819, + "learning_rate": 0.00010790341733737961, + "loss": 0.0319, + "step": 3495 + }, + { + "epoch": 0.9228132621448817, + "grad_norm": 0.21372731029987335, + "learning_rate": 0.00010777147380920966, + "loss": 0.0315, + "step": 3500 + }, + { + "epoch": 0.9228132621448817, + "eval_loss": 0.02952708676457405, + "eval_runtime": 451.5837, + "eval_samples_per_second": 7.467, + "eval_steps_per_second": 3.734, + "step": 3500 + }, + { + "epoch": 0.9241315668050887, + "grad_norm": 0.016639264300465584, + "learning_rate": 0.00010763953028103972, + "loss": 0.0125, + "step": 3505 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.46340492367744446, + "learning_rate": 0.00010750758675286979, + "loss": 0.0186, + "step": 3510 + }, + { + "epoch": 0.9267681761255026, + "grad_norm": 0.01847526989877224, + "learning_rate": 0.00010737564322469984, + "loss": 0.0026, + "step": 3515 + }, + { + "epoch": 0.9280864807857095, + "grad_norm": 0.5947860479354858, + "learning_rate": 0.0001072436996965299, + "loss": 0.0259, + "step": 3520 + }, + { + "epoch": 0.9294047854459165, + "grad_norm": 0.06145291402935982, + "learning_rate": 0.00010711175616835995, + "loss": 0.0057, + "step": 3525 + }, + { + "epoch": 0.9307230901061235, + "grad_norm": 0.0143959429115057, + "learning_rate": 0.00010697981264019001, + "loss": 0.0145, + "step": 3530 + }, + { + "epoch": 0.9320413947663305, + "grad_norm": 0.21143831312656403, + "learning_rate": 0.00010684786911202007, + "loss": 0.0459, + "step": 3535 + }, + { + "epoch": 0.9333596994265375, + "grad_norm": 0.02548077143728733, 
+ "learning_rate": 0.00010671592558385011, + "loss": 0.0051, + "step": 3540 + }, + { + "epoch": 0.9346780040867444, + "grad_norm": 0.008077048696577549, + "learning_rate": 0.00010658398205568016, + "loss": 0.0306, + "step": 3545 + }, + { + "epoch": 0.9359963087469514, + "grad_norm": 0.0030760422814637423, + "learning_rate": 0.00010645203852751022, + "loss": 0.0575, + "step": 3550 + }, + { + "epoch": 0.9373146134071584, + "grad_norm": 0.18114158511161804, + "learning_rate": 0.00010632009499934027, + "loss": 0.0885, + "step": 3555 + }, + { + "epoch": 0.9386329180673654, + "grad_norm": 0.02450549602508545, + "learning_rate": 0.00010618815147117034, + "loss": 0.0045, + "step": 3560 + }, + { + "epoch": 0.9399512227275724, + "grad_norm": 0.1238626018166542, + "learning_rate": 0.0001060562079430004, + "loss": 0.0166, + "step": 3565 + }, + { + "epoch": 0.9412695273877794, + "grad_norm": 0.1879919469356537, + "learning_rate": 0.00010592426441483046, + "loss": 0.0077, + "step": 3570 + }, + { + "epoch": 0.9425878320479862, + "grad_norm": 0.11323565989732742, + "learning_rate": 0.00010579232088666051, + "loss": 0.0213, + "step": 3575 + }, + { + "epoch": 0.9439061367081932, + "grad_norm": 0.35575854778289795, + "learning_rate": 0.00010566037735849057, + "loss": 0.0336, + "step": 3580 + }, + { + "epoch": 0.9452244413684002, + "grad_norm": 0.14052227139472961, + "learning_rate": 0.00010552843383032062, + "loss": 0.0325, + "step": 3585 + }, + { + "epoch": 0.9465427460286072, + "grad_norm": 0.2643798887729645, + "learning_rate": 0.00010539649030215069, + "loss": 0.0192, + "step": 3590 + }, + { + "epoch": 0.9478610506888142, + "grad_norm": 0.3207031190395355, + "learning_rate": 0.00010526454677398075, + "loss": 0.0221, + "step": 3595 + }, + { + "epoch": 0.9491793553490212, + "grad_norm": 0.022803861647844315, + "learning_rate": 0.0001051326032458108, + "loss": 0.029, + "step": 3600 + }, + { + "epoch": 0.9504976600092281, + "grad_norm": 0.02511664852499962, + "learning_rate": 0.00010500065971764086, + "loss": 0.0422, + "step": 3605 + }, + { + "epoch": 0.9518159646694351, + "grad_norm": 0.06505445390939713, + "learning_rate": 0.00010486871618947091, + "loss": 0.0092, + "step": 3610 + }, + { + "epoch": 0.9531342693296421, + "grad_norm": 0.09998584538698196, + "learning_rate": 0.00010473677266130097, + "loss": 0.0242, + "step": 3615 + }, + { + "epoch": 0.9544525739898491, + "grad_norm": 0.9645698666572571, + "learning_rate": 0.00010460482913313104, + "loss": 0.0124, + "step": 3620 + }, + { + "epoch": 0.955770878650056, + "grad_norm": 0.2389964610338211, + "learning_rate": 0.0001044728856049611, + "loss": 0.0169, + "step": 3625 + }, + { + "epoch": 0.957089183310263, + "grad_norm": 2.030608654022217, + "learning_rate": 0.00010434094207679115, + "loss": 0.0518, + "step": 3630 + }, + { + "epoch": 0.9584074879704699, + "grad_norm": 0.05979987606406212, + "learning_rate": 0.0001042089985486212, + "loss": 0.0081, + "step": 3635 + }, + { + "epoch": 0.9597257926306769, + "grad_norm": 0.15761719644069672, + "learning_rate": 0.00010407705502045125, + "loss": 0.0061, + "step": 3640 + }, + { + "epoch": 0.9610440972908839, + "grad_norm": 0.6534290909767151, + "learning_rate": 0.0001039451114922813, + "loss": 0.0104, + "step": 3645 + }, + { + "epoch": 0.9623624019510909, + "grad_norm": 1.0324147939682007, + "learning_rate": 0.00010381316796411136, + "loss": 0.0381, + "step": 3650 + }, + { + "epoch": 0.9636807066112979, + "grad_norm": 0.002968872431665659, + "learning_rate": 0.00010368122443594142, + "loss": 0.0343, + "step": 
3655 + }, + { + "epoch": 0.9649990112715049, + "grad_norm": 0.011243184097111225, + "learning_rate": 0.00010354928090777147, + "loss": 0.019, + "step": 3660 + }, + { + "epoch": 0.9663173159317118, + "grad_norm": 0.17663739621639252, + "learning_rate": 0.00010341733737960153, + "loss": 0.0452, + "step": 3665 + }, + { + "epoch": 0.9676356205919188, + "grad_norm": 1.2647719383239746, + "learning_rate": 0.00010328539385143158, + "loss": 0.0154, + "step": 3670 + }, + { + "epoch": 0.9689539252521258, + "grad_norm": 0.3691752552986145, + "learning_rate": 0.00010315345032326165, + "loss": 0.028, + "step": 3675 + }, + { + "epoch": 0.9702722299123328, + "grad_norm": 0.0015879774000495672, + "learning_rate": 0.00010302150679509171, + "loss": 0.0202, + "step": 3680 + }, + { + "epoch": 0.9715905345725397, + "grad_norm": 0.1441984623670578, + "learning_rate": 0.00010288956326692176, + "loss": 0.0221, + "step": 3685 + }, + { + "epoch": 0.9729088392327467, + "grad_norm": 0.20431455969810486, + "learning_rate": 0.00010275761973875182, + "loss": 0.0072, + "step": 3690 + }, + { + "epoch": 0.9742271438929536, + "grad_norm": 0.861625611782074, + "learning_rate": 0.00010262567621058187, + "loss": 0.0523, + "step": 3695 + }, + { + "epoch": 0.9755454485531606, + "grad_norm": 0.005049478262662888, + "learning_rate": 0.00010249373268241193, + "loss": 0.0051, + "step": 3700 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.49685510993003845, + "learning_rate": 0.000102361789154242, + "loss": 0.023, + "step": 3705 + }, + { + "epoch": 0.9781820578735746, + "grad_norm": 0.08789395540952682, + "learning_rate": 0.00010222984562607205, + "loss": 0.0159, + "step": 3710 + }, + { + "epoch": 0.9795003625337816, + "grad_norm": 0.027168691158294678, + "learning_rate": 0.00010209790209790211, + "loss": 0.0083, + "step": 3715 + }, + { + "epoch": 0.9808186671939886, + "grad_norm": 0.0006773864733986557, + "learning_rate": 0.00010196595856973217, + "loss": 0.0048, + "step": 3720 + }, + { + "epoch": 0.9821369718541955, + "grad_norm": 0.01636457070708275, + "learning_rate": 0.00010183401504156222, + "loss": 0.0159, + "step": 3725 + }, + { + "epoch": 0.9834552765144025, + "grad_norm": 0.10160859674215317, + "learning_rate": 0.00010170207151339228, + "loss": 0.0047, + "step": 3730 + }, + { + "epoch": 0.9847735811746094, + "grad_norm": 0.14173269271850586, + "learning_rate": 0.00010157012798522232, + "loss": 0.006, + "step": 3735 + }, + { + "epoch": 0.9860918858348164, + "grad_norm": 0.003458512481302023, + "learning_rate": 0.00010143818445705238, + "loss": 0.0193, + "step": 3740 + }, + { + "epoch": 0.9874101904950234, + "grad_norm": 0.005163820460438728, + "learning_rate": 0.00010130624092888243, + "loss": 0.0039, + "step": 3745 + }, + { + "epoch": 0.9887284951552304, + "grad_norm": 0.005913791712373495, + "learning_rate": 0.00010117429740071249, + "loss": 0.0119, + "step": 3750 + }, + { + "epoch": 0.9900467998154373, + "grad_norm": 0.00800853967666626, + "learning_rate": 0.00010104235387254256, + "loss": 0.044, + "step": 3755 + }, + { + "epoch": 0.9913651044756443, + "grad_norm": 0.18146778643131256, + "learning_rate": 0.00010091041034437261, + "loss": 0.0048, + "step": 3760 + }, + { + "epoch": 0.9926834091358513, + "grad_norm": 0.01235104724764824, + "learning_rate": 0.00010077846681620267, + "loss": 0.0017, + "step": 3765 + }, + { + "epoch": 0.9940017137960583, + "grad_norm": 0.17677897214889526, + "learning_rate": 0.00010064652328803272, + "loss": 0.0339, + "step": 3770 + }, + { + "epoch": 0.9953200184562653, + "grad_norm": 
0.0017472271574661136, + "learning_rate": 0.00010051457975986278, + "loss": 0.0494, + "step": 3775 + }, + { + "epoch": 0.9966383231164723, + "grad_norm": 0.10814860463142395, + "learning_rate": 0.00010038263623169283, + "loss": 0.0741, + "step": 3780 + }, + { + "epoch": 0.9979566277766792, + "grad_norm": 0.11329760402441025, + "learning_rate": 0.0001002506927035229, + "loss": 0.0182, + "step": 3785 + }, + { + "epoch": 0.9992749324368861, + "grad_norm": 0.11573276668787003, + "learning_rate": 0.00010011874917535296, + "loss": 0.0068, + "step": 3790 + }, + { + "epoch": 1.000790982796124, + "grad_norm": 0.08449886739253998, + "learning_rate": 9.998680564718301e-05, + "loss": 0.0141, + "step": 3795 + }, + { + "epoch": 1.002109287456331, + "grad_norm": 0.05035184696316719, + "learning_rate": 9.985486211901307e-05, + "loss": 0.0293, + "step": 3800 + }, + { + "epoch": 1.003427592116538, + "grad_norm": 0.0255444198846817, + "learning_rate": 9.972291859084313e-05, + "loss": 0.0054, + "step": 3805 + }, + { + "epoch": 1.004745896776745, + "grad_norm": 0.0033677336759865284, + "learning_rate": 9.959097506267318e-05, + "loss": 0.0567, + "step": 3810 + }, + { + "epoch": 1.006064201436952, + "grad_norm": 0.09453682601451874, + "learning_rate": 9.945903153450324e-05, + "loss": 0.0589, + "step": 3815 + }, + { + "epoch": 1.007382506097159, + "grad_norm": 0.01592979207634926, + "learning_rate": 9.932708800633329e-05, + "loss": 0.0043, + "step": 3820 + }, + { + "epoch": 1.008700810757366, + "grad_norm": 0.002263693604618311, + "learning_rate": 9.919514447816335e-05, + "loss": 0.0195, + "step": 3825 + }, + { + "epoch": 1.010019115417573, + "grad_norm": 0.013390793465077877, + "learning_rate": 9.90632009499934e-05, + "loss": 0.0152, + "step": 3830 + }, + { + "epoch": 1.01133742007778, + "grad_norm": 0.10473847389221191, + "learning_rate": 9.893125742182346e-05, + "loss": 0.0606, + "step": 3835 + }, + { + "epoch": 1.012655724737987, + "grad_norm": 0.05837221071124077, + "learning_rate": 9.879931389365353e-05, + "loss": 0.0121, + "step": 3840 + }, + { + "epoch": 1.013974029398194, + "grad_norm": 0.3803791105747223, + "learning_rate": 9.866737036548358e-05, + "loss": 0.0386, + "step": 3845 + }, + { + "epoch": 1.0152923340584008, + "grad_norm": 0.4067519009113312, + "learning_rate": 9.853542683731364e-05, + "loss": 0.0115, + "step": 3850 + }, + { + "epoch": 1.0166106387186078, + "grad_norm": 0.02585229091346264, + "learning_rate": 9.84034833091437e-05, + "loss": 0.0214, + "step": 3855 + }, + { + "epoch": 1.0179289433788148, + "grad_norm": 0.03670825809240341, + "learning_rate": 9.827153978097374e-05, + "loss": 0.0059, + "step": 3860 + }, + { + "epoch": 1.0192472480390218, + "grad_norm": 0.014171554706990719, + "learning_rate": 9.81395962528038e-05, + "loss": 0.0145, + "step": 3865 + }, + { + "epoch": 1.0205655526992288, + "grad_norm": 0.027376385405659676, + "learning_rate": 9.800765272463386e-05, + "loss": 0.0089, + "step": 3870 + }, + { + "epoch": 1.0218838573594358, + "grad_norm": 0.03168405964970589, + "learning_rate": 9.787570919646392e-05, + "loss": 0.0132, + "step": 3875 + }, + { + "epoch": 1.0232021620196428, + "grad_norm": 0.03346199914813042, + "learning_rate": 9.774376566829397e-05, + "loss": 0.0246, + "step": 3880 + }, + { + "epoch": 1.0245204666798498, + "grad_norm": 0.00894144270569086, + "learning_rate": 9.761182214012403e-05, + "loss": 0.0105, + "step": 3885 + }, + { + "epoch": 1.0258387713400567, + "grad_norm": 0.3172806203365326, + "learning_rate": 9.747987861195409e-05, + "loss": 0.0103, + 
"step": 3890 + }, + { + "epoch": 1.0271570760002637, + "grad_norm": 0.009055040776729584, + "learning_rate": 9.734793508378414e-05, + "loss": 0.0103, + "step": 3895 + }, + { + "epoch": 1.0284753806604707, + "grad_norm": 0.014140011742711067, + "learning_rate": 9.721599155561421e-05, + "loss": 0.0037, + "step": 3900 + }, + { + "epoch": 1.0297936853206777, + "grad_norm": 0.008317383006215096, + "learning_rate": 9.708404802744427e-05, + "loss": 0.002, + "step": 3905 + }, + { + "epoch": 1.0311119899808845, + "grad_norm": 0.005038558971136808, + "learning_rate": 9.695210449927431e-05, + "loss": 0.0017, + "step": 3910 + }, + { + "epoch": 1.0324302946410915, + "grad_norm": 0.40058520436286926, + "learning_rate": 9.682016097110436e-05, + "loss": 0.0065, + "step": 3915 + }, + { + "epoch": 1.0337485993012985, + "grad_norm": 0.005197151098400354, + "learning_rate": 9.668821744293442e-05, + "loss": 0.0031, + "step": 3920 + }, + { + "epoch": 1.0350669039615055, + "grad_norm": 0.014353781007230282, + "learning_rate": 9.655627391476449e-05, + "loss": 0.0009, + "step": 3925 + }, + { + "epoch": 1.0363852086217125, + "grad_norm": 0.13260559737682343, + "learning_rate": 9.642433038659454e-05, + "loss": 0.0323, + "step": 3930 + }, + { + "epoch": 1.0377035132819195, + "grad_norm": 0.006795065477490425, + "learning_rate": 9.62923868584246e-05, + "loss": 0.0022, + "step": 3935 + }, + { + "epoch": 1.0390218179421264, + "grad_norm": 0.2276086062192917, + "learning_rate": 9.616044333025466e-05, + "loss": 0.0221, + "step": 3940 + }, + { + "epoch": 1.0403401226023334, + "grad_norm": 0.06121920794248581, + "learning_rate": 9.602849980208471e-05, + "loss": 0.0037, + "step": 3945 + }, + { + "epoch": 1.0416584272625404, + "grad_norm": 0.9180755019187927, + "learning_rate": 9.589655627391477e-05, + "loss": 0.0589, + "step": 3950 + }, + { + "epoch": 1.0429767319227474, + "grad_norm": 0.07515591382980347, + "learning_rate": 9.576461274574484e-05, + "loss": 0.0653, + "step": 3955 + }, + { + "epoch": 1.0442950365829544, + "grad_norm": 0.018060607835650444, + "learning_rate": 9.563266921757488e-05, + "loss": 0.0178, + "step": 3960 + }, + { + "epoch": 1.0456133412431612, + "grad_norm": 0.02751368284225464, + "learning_rate": 9.550072568940493e-05, + "loss": 0.0076, + "step": 3965 + }, + { + "epoch": 1.0469316459033682, + "grad_norm": 0.653998613357544, + "learning_rate": 9.536878216123499e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 1.0482499505635752, + "grad_norm": 0.3117768168449402, + "learning_rate": 9.523683863306505e-05, + "loss": 0.0087, + "step": 3975 + }, + { + "epoch": 1.0495682552237822, + "grad_norm": 0.013952831737697124, + "learning_rate": 9.510489510489511e-05, + "loss": 0.0037, + "step": 3980 + }, + { + "epoch": 1.0508865598839892, + "grad_norm": 0.01806250400841236, + "learning_rate": 9.497295157672517e-05, + "loss": 0.0028, + "step": 3985 + }, + { + "epoch": 1.0522048645441962, + "grad_norm": 0.13678006827831268, + "learning_rate": 9.484100804855523e-05, + "loss": 0.0533, + "step": 3990 + }, + { + "epoch": 1.0535231692044031, + "grad_norm": 0.14869382977485657, + "learning_rate": 9.470906452038528e-05, + "loss": 0.009, + "step": 3995 + }, + { + "epoch": 1.0548414738646101, + "grad_norm": 0.33614659309387207, + "learning_rate": 9.457712099221534e-05, + "loss": 0.0555, + "step": 4000 + }, + { + "epoch": 1.0548414738646101, + "eval_loss": 0.026165226474404335, + "eval_runtime": 452.2482, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 4000 + }, + { + "epoch": 
1.0561597785248171, + "grad_norm": 0.007546027656644583, + "learning_rate": 9.444517746404539e-05, + "loss": 0.0029, + "step": 4005 + }, + { + "epoch": 1.0574780831850241, + "grad_norm": 0.3720332384109497, + "learning_rate": 9.431323393587545e-05, + "loss": 0.0353, + "step": 4010 + }, + { + "epoch": 1.0587963878452311, + "grad_norm": 1.1335264444351196, + "learning_rate": 9.41812904077055e-05, + "loss": 0.0142, + "step": 4015 + }, + { + "epoch": 1.060114692505438, + "grad_norm": 0.024723488837480545, + "learning_rate": 9.404934687953556e-05, + "loss": 0.006, + "step": 4020 + }, + { + "epoch": 1.0614329971656449, + "grad_norm": 0.040354058146476746, + "learning_rate": 9.391740335136562e-05, + "loss": 0.0107, + "step": 4025 + }, + { + "epoch": 1.0627513018258519, + "grad_norm": 0.222810298204422, + "learning_rate": 9.378545982319567e-05, + "loss": 0.0273, + "step": 4030 + }, + { + "epoch": 1.0640696064860589, + "grad_norm": 0.025684095919132233, + "learning_rate": 9.365351629502574e-05, + "loss": 0.0033, + "step": 4035 + }, + { + "epoch": 1.0653879111462659, + "grad_norm": 0.05338352546095848, + "learning_rate": 9.35215727668558e-05, + "loss": 0.0052, + "step": 4040 + }, + { + "epoch": 1.0667062158064728, + "grad_norm": 0.06182330474257469, + "learning_rate": 9.338962923868585e-05, + "loss": 0.0038, + "step": 4045 + }, + { + "epoch": 1.0680245204666798, + "grad_norm": 0.012170832604169846, + "learning_rate": 9.325768571051591e-05, + "loss": 0.0018, + "step": 4050 + }, + { + "epoch": 1.0693428251268868, + "grad_norm": 0.5424306392669678, + "learning_rate": 9.312574218234596e-05, + "loss": 0.0445, + "step": 4055 + }, + { + "epoch": 1.0706611297870938, + "grad_norm": 0.017939254641532898, + "learning_rate": 9.299379865417602e-05, + "loss": 0.0389, + "step": 4060 + }, + { + "epoch": 1.0719794344473008, + "grad_norm": 0.0060431682504713535, + "learning_rate": 9.286185512600607e-05, + "loss": 0.0025, + "step": 4065 + }, + { + "epoch": 1.0732977391075078, + "grad_norm": 0.0071444883942604065, + "learning_rate": 9.272991159783613e-05, + "loss": 0.0333, + "step": 4070 + }, + { + "epoch": 1.0746160437677148, + "grad_norm": 0.29632750153541565, + "learning_rate": 9.259796806966619e-05, + "loss": 0.0151, + "step": 4075 + }, + { + "epoch": 1.0759343484279218, + "grad_norm": 0.004526323173195124, + "learning_rate": 9.246602454149624e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 1.0772526530881286, + "grad_norm": 0.023945212364196777, + "learning_rate": 9.23340810133263e-05, + "loss": 0.004, + "step": 4085 + }, + { + "epoch": 1.0785709577483356, + "grad_norm": 0.13235126435756683, + "learning_rate": 9.220213748515635e-05, + "loss": 0.0059, + "step": 4090 + }, + { + "epoch": 1.0798892624085425, + "grad_norm": 0.17592330276966095, + "learning_rate": 9.207019395698642e-05, + "loss": 0.0302, + "step": 4095 + }, + { + "epoch": 1.0812075670687495, + "grad_norm": 0.004582866560667753, + "learning_rate": 9.193825042881648e-05, + "loss": 0.009, + "step": 4100 + }, + { + "epoch": 1.0825258717289565, + "grad_norm": 0.15214525163173676, + "learning_rate": 9.180630690064653e-05, + "loss": 0.0062, + "step": 4105 + }, + { + "epoch": 1.0838441763891635, + "grad_norm": 0.16535983979701996, + "learning_rate": 9.167436337247658e-05, + "loss": 0.0926, + "step": 4110 + }, + { + "epoch": 1.0851624810493705, + "grad_norm": 0.013285227119922638, + "learning_rate": 9.154241984430663e-05, + "loss": 0.0043, + "step": 4115 + }, + { + "epoch": 1.0864807857095775, + "grad_norm": 0.012116984464228153, + "learning_rate": 
9.14104763161367e-05, + "loss": 0.0037, + "step": 4120 + }, + { + "epoch": 1.0877990903697845, + "grad_norm": 0.0373845212161541, + "learning_rate": 9.127853278796676e-05, + "loss": 0.0081, + "step": 4125 + }, + { + "epoch": 1.0891173950299915, + "grad_norm": 0.09324615448713303, + "learning_rate": 9.114658925979681e-05, + "loss": 0.0534, + "step": 4130 + }, + { + "epoch": 1.0904356996901985, + "grad_norm": 0.010992968454957008, + "learning_rate": 9.101464573162687e-05, + "loss": 0.0025, + "step": 4135 + }, + { + "epoch": 1.0917540043504055, + "grad_norm": 0.13710318505764008, + "learning_rate": 9.088270220345692e-05, + "loss": 0.0555, + "step": 4140 + }, + { + "epoch": 1.0930723090106123, + "grad_norm": 0.010403074324131012, + "learning_rate": 9.075075867528698e-05, + "loss": 0.0042, + "step": 4145 + }, + { + "epoch": 1.0943906136708192, + "grad_norm": 0.21544460952281952, + "learning_rate": 9.061881514711705e-05, + "loss": 0.0144, + "step": 4150 + }, + { + "epoch": 1.0957089183310262, + "grad_norm": 0.04194799065589905, + "learning_rate": 9.04868716189471e-05, + "loss": 0.0106, + "step": 4155 + }, + { + "epoch": 1.0970272229912332, + "grad_norm": 0.029204202815890312, + "learning_rate": 9.035492809077715e-05, + "loss": 0.0085, + "step": 4160 + }, + { + "epoch": 1.0983455276514402, + "grad_norm": 0.006751026958227158, + "learning_rate": 9.02229845626072e-05, + "loss": 0.0049, + "step": 4165 + }, + { + "epoch": 1.0996638323116472, + "grad_norm": 0.008232722990214825, + "learning_rate": 9.009104103443726e-05, + "loss": 0.0172, + "step": 4170 + }, + { + "epoch": 1.1009821369718542, + "grad_norm": 0.05630079656839371, + "learning_rate": 8.995909750626733e-05, + "loss": 0.0112, + "step": 4175 + }, + { + "epoch": 1.1023004416320612, + "grad_norm": 0.0011601662263274193, + "learning_rate": 8.982715397809738e-05, + "loss": 0.0317, + "step": 4180 + }, + { + "epoch": 1.1036187462922682, + "grad_norm": 0.006554402410984039, + "learning_rate": 8.969521044992744e-05, + "loss": 0.0035, + "step": 4185 + }, + { + "epoch": 1.1049370509524752, + "grad_norm": 0.34513652324676514, + "learning_rate": 8.956326692175749e-05, + "loss": 0.0036, + "step": 4190 + }, + { + "epoch": 1.1062553556126822, + "grad_norm": 0.283669650554657, + "learning_rate": 8.943132339358755e-05, + "loss": 0.0182, + "step": 4195 + }, + { + "epoch": 1.1075736602728892, + "grad_norm": 0.5376952290534973, + "learning_rate": 8.92993798654176e-05, + "loss": 0.0293, + "step": 4200 + }, + { + "epoch": 1.108891964933096, + "grad_norm": 0.01689724065363407, + "learning_rate": 8.916743633724767e-05, + "loss": 0.0206, + "step": 4205 + }, + { + "epoch": 1.110210269593303, + "grad_norm": 0.026538770645856857, + "learning_rate": 8.903549280907772e-05, + "loss": 0.0181, + "step": 4210 + }, + { + "epoch": 1.11152857425351, + "grad_norm": 0.6372873783111572, + "learning_rate": 8.890354928090777e-05, + "loss": 0.021, + "step": 4215 + }, + { + "epoch": 1.112846878913717, + "grad_norm": 0.06177428737282753, + "learning_rate": 8.877160575273783e-05, + "loss": 0.0033, + "step": 4220 + }, + { + "epoch": 1.114165183573924, + "grad_norm": 0.3712109923362732, + "learning_rate": 8.863966222456788e-05, + "loss": 0.0075, + "step": 4225 + }, + { + "epoch": 1.115483488234131, + "grad_norm": 0.030514653772115707, + "learning_rate": 8.850771869639795e-05, + "loss": 0.0183, + "step": 4230 + }, + { + "epoch": 1.116801792894338, + "grad_norm": 0.012861707247793674, + "learning_rate": 8.837577516822801e-05, + "loss": 0.0032, + "step": 4235 + }, + { + "epoch": 
1.118120097554545, + "grad_norm": 0.3278522789478302, + "learning_rate": 8.824383164005806e-05, + "loss": 0.0058, + "step": 4240 + }, + { + "epoch": 1.1194384022147519, + "grad_norm": 0.580259382724762, + "learning_rate": 8.811188811188812e-05, + "loss": 0.0068, + "step": 4245 + }, + { + "epoch": 1.1207567068749589, + "grad_norm": 0.007002575788646936, + "learning_rate": 8.797994458371817e-05, + "loss": 0.0063, + "step": 4250 + }, + { + "epoch": 1.1220750115351659, + "grad_norm": 0.22484643757343292, + "learning_rate": 8.784800105554823e-05, + "loss": 0.0167, + "step": 4255 + }, + { + "epoch": 1.1233933161953726, + "grad_norm": 0.004122686106711626, + "learning_rate": 8.771605752737829e-05, + "loss": 0.002, + "step": 4260 + }, + { + "epoch": 1.1247116208555796, + "grad_norm": 0.009832561016082764, + "learning_rate": 8.758411399920834e-05, + "loss": 0.0029, + "step": 4265 + }, + { + "epoch": 1.1260299255157866, + "grad_norm": 0.04854527860879898, + "learning_rate": 8.74521704710384e-05, + "loss": 0.0068, + "step": 4270 + }, + { + "epoch": 1.1273482301759936, + "grad_norm": 0.12221235036849976, + "learning_rate": 8.732022694286845e-05, + "loss": 0.003, + "step": 4275 + }, + { + "epoch": 1.1286665348362006, + "grad_norm": 0.005857539363205433, + "learning_rate": 8.718828341469851e-05, + "loss": 0.0022, + "step": 4280 + }, + { + "epoch": 1.1299848394964076, + "grad_norm": 0.10582758486270905, + "learning_rate": 8.705633988652856e-05, + "loss": 0.002, + "step": 4285 + }, + { + "epoch": 1.1313031441566146, + "grad_norm": 0.006190940272063017, + "learning_rate": 8.692439635835863e-05, + "loss": 0.0022, + "step": 4290 + }, + { + "epoch": 1.1326214488168216, + "grad_norm": 0.00221514655277133, + "learning_rate": 8.679245283018869e-05, + "loss": 0.0314, + "step": 4295 + }, + { + "epoch": 1.1339397534770286, + "grad_norm": 0.0796755850315094, + "learning_rate": 8.666050930201874e-05, + "loss": 0.0347, + "step": 4300 + }, + { + "epoch": 1.1352580581372356, + "grad_norm": 0.20088806748390198, + "learning_rate": 8.65285657738488e-05, + "loss": 0.0048, + "step": 4305 + }, + { + "epoch": 1.1365763627974426, + "grad_norm": 0.4018377363681793, + "learning_rate": 8.639662224567884e-05, + "loss": 0.0234, + "step": 4310 + }, + { + "epoch": 1.1378946674576496, + "grad_norm": 0.014961684122681618, + "learning_rate": 8.626467871750891e-05, + "loss": 0.0033, + "step": 4315 + }, + { + "epoch": 1.1392129721178565, + "grad_norm": 0.004534922540187836, + "learning_rate": 8.613273518933897e-05, + "loss": 0.0021, + "step": 4320 + }, + { + "epoch": 1.1405312767780633, + "grad_norm": 0.06340984255075455, + "learning_rate": 8.600079166116902e-05, + "loss": 0.0538, + "step": 4325 + }, + { + "epoch": 1.1418495814382703, + "grad_norm": 0.007374623324722052, + "learning_rate": 8.586884813299908e-05, + "loss": 0.0157, + "step": 4330 + }, + { + "epoch": 1.1431678860984773, + "grad_norm": 0.02313193492591381, + "learning_rate": 8.573690460482913e-05, + "loss": 0.0307, + "step": 4335 + }, + { + "epoch": 1.1444861907586843, + "grad_norm": 0.014071634039282799, + "learning_rate": 8.560496107665919e-05, + "loss": 0.0058, + "step": 4340 + }, + { + "epoch": 1.1458044954188913, + "grad_norm": 1.4664901494979858, + "learning_rate": 8.547301754848926e-05, + "loss": 0.0566, + "step": 4345 + }, + { + "epoch": 1.1471228000790983, + "grad_norm": 0.023680074140429497, + "learning_rate": 8.534107402031931e-05, + "loss": 0.0048, + "step": 4350 + }, + { + "epoch": 1.1484411047393053, + "grad_norm": 0.012555698864161968, + "learning_rate": 
8.520913049214937e-05, + "loss": 0.0076, + "step": 4355 + }, + { + "epoch": 1.1497594093995123, + "grad_norm": 0.013624129816889763, + "learning_rate": 8.507718696397941e-05, + "loss": 0.0373, + "step": 4360 + }, + { + "epoch": 1.1510777140597193, + "grad_norm": 0.015372387133538723, + "learning_rate": 8.494524343580947e-05, + "loss": 0.0147, + "step": 4365 + }, + { + "epoch": 1.1523960187199263, + "grad_norm": 0.3312993347644806, + "learning_rate": 8.481329990763954e-05, + "loss": 0.0299, + "step": 4370 + }, + { + "epoch": 1.1537143233801332, + "grad_norm": 0.023838184773921967, + "learning_rate": 8.468135637946959e-05, + "loss": 0.0226, + "step": 4375 + }, + { + "epoch": 1.15503262804034, + "grad_norm": 0.42516952753067017, + "learning_rate": 8.454941285129965e-05, + "loss": 0.0088, + "step": 4380 + }, + { + "epoch": 1.156350932700547, + "grad_norm": 0.6900278925895691, + "learning_rate": 8.44174693231297e-05, + "loss": 0.0245, + "step": 4385 + }, + { + "epoch": 1.157669237360754, + "grad_norm": 0.2932703197002411, + "learning_rate": 8.428552579495976e-05, + "loss": 0.0207, + "step": 4390 + }, + { + "epoch": 1.158987542020961, + "grad_norm": 0.12942780554294586, + "learning_rate": 8.415358226678982e-05, + "loss": 0.0037, + "step": 4395 + }, + { + "epoch": 1.160305846681168, + "grad_norm": 0.9499046802520752, + "learning_rate": 8.402163873861989e-05, + "loss": 0.0246, + "step": 4400 + }, + { + "epoch": 1.161624151341375, + "grad_norm": 0.008869118988513947, + "learning_rate": 8.388969521044994e-05, + "loss": 0.0171, + "step": 4405 + }, + { + "epoch": 1.162942456001582, + "grad_norm": 1.7409231662750244, + "learning_rate": 8.375775168227998e-05, + "loss": 0.017, + "step": 4410 + }, + { + "epoch": 1.164260760661789, + "grad_norm": 0.0020101398695260286, + "learning_rate": 8.362580815411004e-05, + "loss": 0.0027, + "step": 4415 + }, + { + "epoch": 1.165579065321996, + "grad_norm": 0.0785067081451416, + "learning_rate": 8.34938646259401e-05, + "loss": 0.0043, + "step": 4420 + }, + { + "epoch": 1.166897369982203, + "grad_norm": 0.0029506285209208727, + "learning_rate": 8.336192109777016e-05, + "loss": 0.0109, + "step": 4425 + }, + { + "epoch": 1.16821567464241, + "grad_norm": 0.02216683328151703, + "learning_rate": 8.322997756960022e-05, + "loss": 0.0026, + "step": 4430 + }, + { + "epoch": 1.1695339793026167, + "grad_norm": 0.02216639369726181, + "learning_rate": 8.309803404143027e-05, + "loss": 0.0045, + "step": 4435 + }, + { + "epoch": 1.170852283962824, + "grad_norm": 0.0, + "learning_rate": 8.296609051326033e-05, + "loss": 0.006, + "step": 4440 + }, + { + "epoch": 1.1721705886230307, + "grad_norm": 0.0019736960530281067, + "learning_rate": 8.283414698509039e-05, + "loss": 0.0078, + "step": 4445 + }, + { + "epoch": 1.1734888932832377, + "grad_norm": 0.012957746163010597, + "learning_rate": 8.270220345692044e-05, + "loss": 0.002, + "step": 4450 + }, + { + "epoch": 1.1748071979434447, + "grad_norm": 0.010877869091928005, + "learning_rate": 8.25702599287505e-05, + "loss": 0.0237, + "step": 4455 + }, + { + "epoch": 1.1761255026036517, + "grad_norm": 0.005947659723460674, + "learning_rate": 8.243831640058055e-05, + "loss": 0.0341, + "step": 4460 + }, + { + "epoch": 1.1774438072638587, + "grad_norm": 0.0005026470171287656, + "learning_rate": 8.230637287241061e-05, + "loss": 0.0033, + "step": 4465 + }, + { + "epoch": 1.1787621119240657, + "grad_norm": 0.022054588422179222, + "learning_rate": 8.217442934424066e-05, + "loss": 0.0042, + "step": 4470 + }, + { + "epoch": 1.1800804165842727, + 
"grad_norm": 0.7929030656814575, + "learning_rate": 8.204248581607072e-05, + "loss": 0.0076, + "step": 4475 + }, + { + "epoch": 1.1813987212444796, + "grad_norm": 0.39052629470825195, + "learning_rate": 8.191054228790078e-05, + "loss": 0.0228, + "step": 4480 + }, + { + "epoch": 1.1827170259046866, + "grad_norm": 0.007177622988820076, + "learning_rate": 8.177859875973084e-05, + "loss": 0.01, + "step": 4485 + }, + { + "epoch": 1.1840353305648936, + "grad_norm": 0.006175135262310505, + "learning_rate": 8.16466552315609e-05, + "loss": 0.0037, + "step": 4490 + }, + { + "epoch": 1.1853536352251006, + "grad_norm": 0.0356481671333313, + "learning_rate": 8.151471170339096e-05, + "loss": 0.0024, + "step": 4495 + }, + { + "epoch": 1.1866719398853074, + "grad_norm": 0.19069480895996094, + "learning_rate": 8.138276817522101e-05, + "loss": 0.0048, + "step": 4500 + }, + { + "epoch": 1.1866719398853074, + "eval_loss": 0.026386437937617302, + "eval_runtime": 452.2896, + "eval_samples_per_second": 7.455, + "eval_steps_per_second": 3.728, + "step": 4500 + }, + { + "epoch": 1.1879902445455144, + "grad_norm": 0.002254961524158716, + "learning_rate": 8.125082464705107e-05, + "loss": 0.0014, + "step": 4505 + }, + { + "epoch": 1.1893085492057214, + "grad_norm": 0.8026870489120483, + "learning_rate": 8.111888111888112e-05, + "loss": 0.0411, + "step": 4510 + }, + { + "epoch": 1.1906268538659284, + "grad_norm": 0.47328072786331177, + "learning_rate": 8.098693759071118e-05, + "loss": 0.0271, + "step": 4515 + }, + { + "epoch": 1.1919451585261354, + "grad_norm": 0.4888288676738739, + "learning_rate": 8.085499406254123e-05, + "loss": 0.039, + "step": 4520 + }, + { + "epoch": 1.1932634631863424, + "grad_norm": 0.000925812462810427, + "learning_rate": 8.072305053437129e-05, + "loss": 0.0461, + "step": 4525 + }, + { + "epoch": 1.1945817678465493, + "grad_norm": 0.12472371757030487, + "learning_rate": 8.059110700620135e-05, + "loss": 0.0037, + "step": 4530 + }, + { + "epoch": 1.1959000725067563, + "grad_norm": 0.002875336678698659, + "learning_rate": 8.04591634780314e-05, + "loss": 0.0425, + "step": 4535 + }, + { + "epoch": 1.1972183771669633, + "grad_norm": 0.042056187987327576, + "learning_rate": 8.032721994986147e-05, + "loss": 0.0068, + "step": 4540 + }, + { + "epoch": 1.1985366818271703, + "grad_norm": 0.157605841755867, + "learning_rate": 8.019527642169153e-05, + "loss": 0.0179, + "step": 4545 + }, + { + "epoch": 1.1998549864873773, + "grad_norm": 0.005153563339263201, + "learning_rate": 8.006333289352158e-05, + "loss": 0.0045, + "step": 4550 + }, + { + "epoch": 1.201173291147584, + "grad_norm": 0.02541598491370678, + "learning_rate": 7.993138936535164e-05, + "loss": 0.0041, + "step": 4555 + }, + { + "epoch": 1.2024915958077913, + "grad_norm": 0.04266195371747017, + "learning_rate": 7.979944583718168e-05, + "loss": 0.0121, + "step": 4560 + }, + { + "epoch": 1.203809900467998, + "grad_norm": 0.36108532547950745, + "learning_rate": 7.966750230901175e-05, + "loss": 0.0147, + "step": 4565 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.40405452251434326, + "learning_rate": 7.95355587808418e-05, + "loss": 0.0056, + "step": 4570 + }, + { + "epoch": 1.206446509788412, + "grad_norm": 0.030422702431678772, + "learning_rate": 7.940361525267186e-05, + "loss": 0.0055, + "step": 4575 + }, + { + "epoch": 1.207764814448619, + "grad_norm": 0.014555396512150764, + "learning_rate": 7.927167172450192e-05, + "loss": 0.0029, + "step": 4580 + }, + { + "epoch": 1.209083119108826, + "grad_norm": 0.33962950110435486, + 
"learning_rate": 7.913972819633197e-05, + "loss": 0.0191, + "step": 4585 + }, + { + "epoch": 1.210401423769033, + "grad_norm": 0.040150560438632965, + "learning_rate": 7.900778466816203e-05, + "loss": 0.0096, + "step": 4590 + }, + { + "epoch": 1.21171972842924, + "grad_norm": 0.2968510091304779, + "learning_rate": 7.88758411399921e-05, + "loss": 0.0311, + "step": 4595 + }, + { + "epoch": 1.213038033089447, + "grad_norm": 0.04709814116358757, + "learning_rate": 7.874389761182215e-05, + "loss": 0.0175, + "step": 4600 + }, + { + "epoch": 1.214356337749654, + "grad_norm": 0.1379537284374237, + "learning_rate": 7.861195408365221e-05, + "loss": 0.02, + "step": 4605 + }, + { + "epoch": 1.215674642409861, + "grad_norm": 0.018291711807250977, + "learning_rate": 7.848001055548225e-05, + "loss": 0.003, + "step": 4610 + }, + { + "epoch": 1.216992947070068, + "grad_norm": 0.041676126420497894, + "learning_rate": 7.83480670273123e-05, + "loss": 0.0054, + "step": 4615 + }, + { + "epoch": 1.2183112517302748, + "grad_norm": 0.0013747498160228133, + "learning_rate": 7.821612349914237e-05, + "loss": 0.0132, + "step": 4620 + }, + { + "epoch": 1.2196295563904818, + "grad_norm": 0.0050489697605371475, + "learning_rate": 7.808417997097243e-05, + "loss": 0.0272, + "step": 4625 + }, + { + "epoch": 1.2209478610506888, + "grad_norm": 0.017974581569433212, + "learning_rate": 7.795223644280249e-05, + "loss": 0.0037, + "step": 4630 + }, + { + "epoch": 1.2222661657108957, + "grad_norm": 0.001916698063723743, + "learning_rate": 7.782029291463254e-05, + "loss": 0.002, + "step": 4635 + }, + { + "epoch": 1.2235844703711027, + "grad_norm": 0.05344574153423309, + "learning_rate": 7.76883493864626e-05, + "loss": 0.0114, + "step": 4640 + }, + { + "epoch": 1.2249027750313097, + "grad_norm": 0.22823786735534668, + "learning_rate": 7.755640585829265e-05, + "loss": 0.0296, + "step": 4645 + }, + { + "epoch": 1.2262210796915167, + "grad_norm": 0.02051074244081974, + "learning_rate": 7.742446233012272e-05, + "loss": 0.0037, + "step": 4650 + }, + { + "epoch": 1.2275393843517237, + "grad_norm": 0.9797061681747437, + "learning_rate": 7.729251880195276e-05, + "loss": 0.011, + "step": 4655 + }, + { + "epoch": 1.2288576890119307, + "grad_norm": 0.0017285927897319198, + "learning_rate": 7.716057527378282e-05, + "loss": 0.0224, + "step": 4660 + }, + { + "epoch": 1.2301759936721377, + "grad_norm": 0.021783018484711647, + "learning_rate": 7.702863174561288e-05, + "loss": 0.0174, + "step": 4665 + }, + { + "epoch": 1.2314942983323447, + "grad_norm": 0.00763307698071003, + "learning_rate": 7.689668821744293e-05, + "loss": 0.0516, + "step": 4670 + }, + { + "epoch": 1.2328126029925515, + "grad_norm": 0.32605209946632385, + "learning_rate": 7.676474468927299e-05, + "loss": 0.0301, + "step": 4675 + }, + { + "epoch": 1.2341309076527585, + "grad_norm": 1.2027722597122192, + "learning_rate": 7.663280116110306e-05, + "loss": 0.0474, + "step": 4680 + }, + { + "epoch": 1.2354492123129655, + "grad_norm": 0.10201717168092728, + "learning_rate": 7.650085763293311e-05, + "loss": 0.0144, + "step": 4685 + }, + { + "epoch": 1.2367675169731724, + "grad_norm": 0.013835664838552475, + "learning_rate": 7.636891410476317e-05, + "loss": 0.0024, + "step": 4690 + }, + { + "epoch": 1.2380858216333794, + "grad_norm": 0.005699916277080774, + "learning_rate": 7.623697057659322e-05, + "loss": 0.0089, + "step": 4695 + }, + { + "epoch": 1.2394041262935864, + "grad_norm": 0.16583332419395447, + "learning_rate": 7.610502704842328e-05, + "loss": 0.019, + "step": 4700 + }, + { + 
"epoch": 1.2407224309537934, + "grad_norm": 0.2734023332595825, + "learning_rate": 7.597308352025333e-05, + "loss": 0.0041, + "step": 4705 + }, + { + "epoch": 1.2420407356140004, + "grad_norm": 0.04209504276514053, + "learning_rate": 7.584113999208339e-05, + "loss": 0.0292, + "step": 4710 + }, + { + "epoch": 1.2433590402742074, + "grad_norm": 0.0303195733577013, + "learning_rate": 7.570919646391345e-05, + "loss": 0.0019, + "step": 4715 + }, + { + "epoch": 1.2446773449344144, + "grad_norm": 0.014011899940669537, + "learning_rate": 7.55772529357435e-05, + "loss": 0.0236, + "step": 4720 + }, + { + "epoch": 1.2459956495946214, + "grad_norm": 0.37838876247406006, + "learning_rate": 7.544530940757356e-05, + "loss": 0.0081, + "step": 4725 + }, + { + "epoch": 1.2473139542548284, + "grad_norm": 0.003717717481777072, + "learning_rate": 7.531336587940361e-05, + "loss": 0.0036, + "step": 4730 + }, + { + "epoch": 1.2486322589150354, + "grad_norm": 1.2284752130508423, + "learning_rate": 7.518142235123368e-05, + "loss": 0.0089, + "step": 4735 + }, + { + "epoch": 1.2499505635752421, + "grad_norm": 0.015356095507740974, + "learning_rate": 7.504947882306374e-05, + "loss": 0.0074, + "step": 4740 + }, + { + "epoch": 1.2512688682354491, + "grad_norm": 0.0020383282098919153, + "learning_rate": 7.49175352948938e-05, + "loss": 0.0444, + "step": 4745 + }, + { + "epoch": 1.2525871728956561, + "grad_norm": 0.006680132355540991, + "learning_rate": 7.478559176672385e-05, + "loss": 0.009, + "step": 4750 + }, + { + "epoch": 1.2539054775558631, + "grad_norm": 0.01650019735097885, + "learning_rate": 7.465364823855389e-05, + "loss": 0.0022, + "step": 4755 + }, + { + "epoch": 1.2552237822160701, + "grad_norm": 0.009536102414131165, + "learning_rate": 7.452170471038396e-05, + "loss": 0.0026, + "step": 4760 + }, + { + "epoch": 1.256542086876277, + "grad_norm": 0.04677430912852287, + "learning_rate": 7.438976118221402e-05, + "loss": 0.004, + "step": 4765 + }, + { + "epoch": 1.257860391536484, + "grad_norm": 0.007777783088386059, + "learning_rate": 7.425781765404407e-05, + "loss": 0.0112, + "step": 4770 + }, + { + "epoch": 1.259178696196691, + "grad_norm": 0.03724197298288345, + "learning_rate": 7.412587412587413e-05, + "loss": 0.0065, + "step": 4775 + }, + { + "epoch": 1.260497000856898, + "grad_norm": 0.0023958412930369377, + "learning_rate": 7.399393059770418e-05, + "loss": 0.0238, + "step": 4780 + }, + { + "epoch": 1.261815305517105, + "grad_norm": 0.0036889975890517235, + "learning_rate": 7.386198706953424e-05, + "loss": 0.0012, + "step": 4785 + }, + { + "epoch": 1.263133610177312, + "grad_norm": 0.0009220903157256544, + "learning_rate": 7.373004354136431e-05, + "loss": 0.0017, + "step": 4790 + }, + { + "epoch": 1.2644519148375188, + "grad_norm": 0.0033395602367818356, + "learning_rate": 7.359810001319436e-05, + "loss": 0.0474, + "step": 4795 + }, + { + "epoch": 1.265770219497726, + "grad_norm": 0.004093261435627937, + "learning_rate": 7.346615648502442e-05, + "loss": 0.0025, + "step": 4800 + }, + { + "epoch": 1.2670885241579328, + "grad_norm": 0.004395488649606705, + "learning_rate": 7.333421295685446e-05, + "loss": 0.0011, + "step": 4805 + }, + { + "epoch": 1.2684068288181398, + "grad_norm": 0.024034051224589348, + "learning_rate": 7.320226942868452e-05, + "loss": 0.0027, + "step": 4810 + }, + { + "epoch": 1.2697251334783468, + "grad_norm": 0.9501499533653259, + "learning_rate": 7.307032590051459e-05, + "loss": 0.0279, + "step": 4815 + }, + { + "epoch": 1.2710434381385538, + "grad_norm": 0.008805549703538418, + 
"learning_rate": 7.293838237234464e-05, + "loss": 0.0403, + "step": 4820 + }, + { + "epoch": 1.2723617427987608, + "grad_norm": 0.01750873774290085, + "learning_rate": 7.28064388441747e-05, + "loss": 0.0571, + "step": 4825 + }, + { + "epoch": 1.2736800474589678, + "grad_norm": 0.004490260500460863, + "learning_rate": 7.267449531600475e-05, + "loss": 0.0269, + "step": 4830 + }, + { + "epoch": 1.2749983521191748, + "grad_norm": 0.07510064542293549, + "learning_rate": 7.254255178783481e-05, + "loss": 0.0123, + "step": 4835 + }, + { + "epoch": 1.2763166567793818, + "grad_norm": 0.039783038198947906, + "learning_rate": 7.241060825966486e-05, + "loss": 0.0137, + "step": 4840 + }, + { + "epoch": 1.2776349614395888, + "grad_norm": 0.019004900008440018, + "learning_rate": 7.227866473149493e-05, + "loss": 0.0047, + "step": 4845 + }, + { + "epoch": 1.2789532660997955, + "grad_norm": 0.04813052713871002, + "learning_rate": 7.214672120332499e-05, + "loss": 0.0021, + "step": 4850 + }, + { + "epoch": 1.2802715707600028, + "grad_norm": 0.00835048221051693, + "learning_rate": 7.201477767515503e-05, + "loss": 0.0014, + "step": 4855 + }, + { + "epoch": 1.2815898754202095, + "grad_norm": 0.008609198965132236, + "learning_rate": 7.188283414698509e-05, + "loss": 0.0219, + "step": 4860 + }, + { + "epoch": 1.2829081800804165, + "grad_norm": 0.007337458431720734, + "learning_rate": 7.175089061881514e-05, + "loss": 0.0014, + "step": 4865 + }, + { + "epoch": 1.2842264847406235, + "grad_norm": 0.0032645913306623697, + "learning_rate": 7.161894709064521e-05, + "loss": 0.0026, + "step": 4870 + }, + { + "epoch": 1.2855447894008305, + "grad_norm": 0.27384671568870544, + "learning_rate": 7.148700356247527e-05, + "loss": 0.0227, + "step": 4875 + }, + { + "epoch": 1.2868630940610375, + "grad_norm": 0.03584875538945198, + "learning_rate": 7.135506003430532e-05, + "loss": 0.0299, + "step": 4880 + }, + { + "epoch": 1.2881813987212445, + "grad_norm": 0.03482440486550331, + "learning_rate": 7.122311650613538e-05, + "loss": 0.0125, + "step": 4885 + }, + { + "epoch": 1.2894997033814515, + "grad_norm": 0.005974395200610161, + "learning_rate": 7.109117297796543e-05, + "loss": 0.0029, + "step": 4890 + }, + { + "epoch": 1.2908180080416585, + "grad_norm": 0.01820153370499611, + "learning_rate": 7.095922944979549e-05, + "loss": 0.0254, + "step": 4895 + }, + { + "epoch": 1.2921363127018655, + "grad_norm": 0.1733965277671814, + "learning_rate": 7.082728592162555e-05, + "loss": 0.028, + "step": 4900 + }, + { + "epoch": 1.2934546173620725, + "grad_norm": 1.3017303943634033, + "learning_rate": 7.06953423934556e-05, + "loss": 0.0213, + "step": 4905 + }, + { + "epoch": 1.2947729220222794, + "grad_norm": 0.01360877975821495, + "learning_rate": 7.056339886528566e-05, + "loss": 0.0039, + "step": 4910 + }, + { + "epoch": 1.2960912266824862, + "grad_norm": 0.01503999624401331, + "learning_rate": 7.043145533711571e-05, + "loss": 0.0102, + "step": 4915 + }, + { + "epoch": 1.2974095313426934, + "grad_norm": 0.2200804352760315, + "learning_rate": 7.029951180894577e-05, + "loss": 0.0461, + "step": 4920 + }, + { + "epoch": 1.2987278360029002, + "grad_norm": 0.08512946963310242, + "learning_rate": 7.016756828077582e-05, + "loss": 0.0066, + "step": 4925 + }, + { + "epoch": 1.3000461406631072, + "grad_norm": 0.08296570926904678, + "learning_rate": 7.00356247526059e-05, + "loss": 0.0223, + "step": 4930 + }, + { + "epoch": 1.3013644453233142, + "grad_norm": 0.008866079151630402, + "learning_rate": 6.990368122443595e-05, + "loss": 0.0032, + "step": 4935 + }, + 
{ + "epoch": 1.3026827499835212, + "grad_norm": 0.024493014439940453, + "learning_rate": 6.9771737696266e-05, + "loss": 0.0128, + "step": 4940 + }, + { + "epoch": 1.3040010546437282, + "grad_norm": 0.08965341746807098, + "learning_rate": 6.963979416809606e-05, + "loss": 0.028, + "step": 4945 + }, + { + "epoch": 1.3053193593039352, + "grad_norm": 0.023156631737947464, + "learning_rate": 6.950785063992612e-05, + "loss": 0.0187, + "step": 4950 + }, + { + "epoch": 1.3066376639641422, + "grad_norm": 0.18552155792713165, + "learning_rate": 6.937590711175617e-05, + "loss": 0.0424, + "step": 4955 + }, + { + "epoch": 1.3079559686243492, + "grad_norm": 0.02200198918581009, + "learning_rate": 6.924396358358623e-05, + "loss": 0.0148, + "step": 4960 + }, + { + "epoch": 1.3092742732845561, + "grad_norm": 0.00568364467471838, + "learning_rate": 6.911202005541628e-05, + "loss": 0.0199, + "step": 4965 + }, + { + "epoch": 1.310592577944763, + "grad_norm": 0.021591177210211754, + "learning_rate": 6.898007652724634e-05, + "loss": 0.0092, + "step": 4970 + }, + { + "epoch": 1.3119108826049701, + "grad_norm": 0.327177494764328, + "learning_rate": 6.88481329990764e-05, + "loss": 0.0047, + "step": 4975 + }, + { + "epoch": 1.313229187265177, + "grad_norm": 0.024512887001037598, + "learning_rate": 6.871618947090645e-05, + "loss": 0.0046, + "step": 4980 + }, + { + "epoch": 1.314547491925384, + "grad_norm": 0.05725006014108658, + "learning_rate": 6.858424594273652e-05, + "loss": 0.0227, + "step": 4985 + }, + { + "epoch": 1.3158657965855909, + "grad_norm": 0.011280277743935585, + "learning_rate": 6.845230241456658e-05, + "loss": 0.0056, + "step": 4990 + }, + { + "epoch": 1.3171841012457979, + "grad_norm": 0.022504402324557304, + "learning_rate": 6.832035888639663e-05, + "loss": 0.0029, + "step": 4995 + }, + { + "epoch": 1.3185024059060049, + "grad_norm": 0.02168826013803482, + "learning_rate": 6.818841535822669e-05, + "loss": 0.0198, + "step": 5000 + }, + { + "epoch": 1.3185024059060049, + "eval_loss": 0.025039294734597206, + "eval_runtime": 452.1097, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 5000 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.035545248314102e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5000/training_args.bin b/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-5500/README.md b/checkpoint-5500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-5500/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More 
Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-5500/adapter_config.json b/checkpoint-5500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-5500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5500/adapter_model.safetensors b/checkpoint-5500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4b613576b23842804e03752443dd6e610a31227b --- /dev/null +++ b/checkpoint-5500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35fbf4f82fd233fc606a3e5c2a38e71165c5c9b0396bc2db7683992c69373a28 +size 97307544 diff --git a/checkpoint-5500/optimizer.pt b/checkpoint-5500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..78e0ec658a653780a4adb9f92227053dc8231d8d --- /dev/null +++ b/checkpoint-5500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:089d4e55c2b86901b7831815c9b5e4d6457ad75b3c6ade35223aeb1d1462add0 +size 50866370 diff --git a/checkpoint-5500/rng_state.pth b/checkpoint-5500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf5195d0b004aac5de0cad819adc5c38bcd3f90e --- /dev/null +++ b/checkpoint-5500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5db32b72ab9baad7926b63babb28dd6717f3b89c3ac96e844ccbe32a5d31d0cb +size 14244 diff --git a/checkpoint-5500/scheduler.pt b/checkpoint-5500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..05dcd60dcf55df978d556d6efa6bdbc1fb62b833 --- /dev/null +++ b/checkpoint-5500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:adc69eb084aa5f49fb7ba41e2edc2d7a96de7528fc08988a6f324ad5771e0b96 +size 1064 diff --git a/checkpoint-5500/special_tokens_map.json b/checkpoint-5500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-5500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-5500/tokenizer.json b/checkpoint-5500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-5500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-5500/tokenizer_config.json b/checkpoint-5500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-5500/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": 
"<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": 
"<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": 
"<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": 
"<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": 
"<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": 
"<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": 
"<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": 
"<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": 
"<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-5500/trainer_state.json b/checkpoint-5500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c81e1713112409df9cf8279cf9c3309b7b5bf7d3 --- /dev/null +++ b/checkpoint-5500/trainer_state.json @@ -0,0 +1,7824 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 
1.4503328719267023, + "eval_steps": 500, + "global_step": 5500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + "step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 
0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + 
"learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + "learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + 
"learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, 
+ "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + "learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + 
"learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + 
"grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + 
"step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 
0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + "loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + 
"learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + 
"epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 
0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + 
"epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 
0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + 
"loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + }, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 
0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + 
"learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 
0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + "loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 
0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + "grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + 
"learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + 
"step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, + { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 
0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.03460961952805519, + "learning_rate": 0.00013402823591502838, + "loss": 0.0097, + "step": 2505 + }, + { + "epoch": 0.6617889394239008, + "grad_norm": 0.31785696744918823, + "learning_rate": 0.00013389629238685842, + "loss": 0.0303, + "step": 2510 + }, + { + "epoch": 0.6631072440841078, + "grad_norm": 0.4273851215839386, + "learning_rate": 0.0001337643488586885, + "loss": 0.0499, + "step": 2515 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.02236153744161129, + "learning_rate": 0.00013363240533051856, + "loss": 0.0069, + "step": 2520 + }, + { + "epoch": 0.6657438534045218, + "grad_norm": 0.1592864990234375, + "learning_rate": 0.0001335004618023486, + "loss": 0.0326, + "step": 2525 + }, + { + "epoch": 0.6670621580647288, + "grad_norm": 0.029961545020341873, + "learning_rate": 0.00013336851827417867, + "loss": 0.0178, + "step": 2530 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.03120764158666134, + "learning_rate": 0.00013323657474600872, + "loss": 0.115, + "step": 2535 + }, + { + "epoch": 0.6696987673851427, + "grad_norm": 0.01060028001666069, + "learning_rate": 0.00013310463121783879, + "loss": 0.0036, + "step": 2540 + }, + { + "epoch": 0.6710170720453497, + "grad_norm": 0.053470809012651443, + "learning_rate": 0.00013297268768966883, + "loss": 0.0079, + "step": 2545 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.022777097299695015, + "learning_rate": 0.00013284074416149887, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6736536813657636, + "grad_norm": 0.0548521913588047, + "learning_rate": 0.00013270880063332894, + "loss": 0.0503, + "step": 2555 + }, + { + "epoch": 0.6749719860259706, + "grad_norm": 0.02028457075357437, + "learning_rate": 0.00013257685710515898, + "loss": 0.0096, + "step": 2560 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.01569107361137867, + "learning_rate": 0.00013244491357698905, + "loss": 0.008, + "step": 2565 + }, + { + "epoch": 0.6776085953463845, + "grad_norm": 0.00743742985650897, + "learning_rate": 0.00013231297004881912, + "loss": 0.005, + "step": 2570 + }, + { + "epoch": 0.6789269000065915, + "grad_norm": 0.025164416059851646, + "learning_rate": 0.00013218102652064916, + "loss": 0.018, + "step": 2575 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.3653188645839691, + "learning_rate": 0.00013204908299247923, + "loss": 0.0295, + "step": 2580 + }, + { + "epoch": 0.6815635093270055, + "grad_norm": 0.685422956943512, + "learning_rate": 0.00013191713946430927, + "loss": 0.0335, + "step": 2585 + }, + { + "epoch": 0.6828818139872125, + "grad_norm": 0.675740122795105, + "learning_rate": 0.00013178519593613934, + "loss": 0.0592, + "step": 2590 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.10513252764940262, + "learning_rate": 0.00013165325240796938, + "loss": 0.0353, + "step": 2595 + }, + { + "epoch": 0.6855184233076264, + "grad_norm": 0.43512973189353943, + "learning_rate": 0.00013152130887979945, + "loss": 0.0142, + "step": 2600 + }, + { + "epoch": 0.6868367279678333, + "grad_norm": 0.029436839744448662, + "learning_rate": 0.00013138936535162952, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.5607122778892517, + "learning_rate": 0.00013125742182345957, + "loss": 0.0184, + "step": 2610 + }, + { + "epoch": 0.6894733372882473, + "grad_norm": 0.11365406215190887, + 
"learning_rate": 0.00013112547829528963, + "loss": 0.006, + "step": 2615 + }, + { + "epoch": 0.6907916419484543, + "grad_norm": 0.047227244824171066, + "learning_rate": 0.00013099353476711968, + "loss": 0.008, + "step": 2620 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.0005877618095837533, + "learning_rate": 0.00013086159123894975, + "loss": 0.0286, + "step": 2625 + }, + { + "epoch": 0.6934282512688682, + "grad_norm": 0.010759112425148487, + "learning_rate": 0.0001307296477107798, + "loss": 0.0062, + "step": 2630 + }, + { + "epoch": 0.6947465559290752, + "grad_norm": 0.07117745280265808, + "learning_rate": 0.00013059770418260986, + "loss": 0.0891, + "step": 2635 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.0639057606458664, + "learning_rate": 0.00013046576065443993, + "loss": 0.0072, + "step": 2640 + }, + { + "epoch": 0.6973831652494892, + "grad_norm": 0.027350090444087982, + "learning_rate": 0.00013033381712626994, + "loss": 0.0103, + "step": 2645 + }, + { + "epoch": 0.6987014699096962, + "grad_norm": 0.015336195938289165, + "learning_rate": 0.0001302018735981, + "loss": 0.0041, + "step": 2650 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 1.0650830268859863, + "learning_rate": 0.00013006993006993008, + "loss": 0.0443, + "step": 2655 + }, + { + "epoch": 0.70133807923011, + "grad_norm": 0.019073212519288063, + "learning_rate": 0.00012993798654176012, + "loss": 0.0331, + "step": 2660 + }, + { + "epoch": 0.702656383890317, + "grad_norm": 0.10109209269285202, + "learning_rate": 0.0001298060430135902, + "loss": 0.0054, + "step": 2665 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.03528957813978195, + "learning_rate": 0.00012967409948542023, + "loss": 0.0427, + "step": 2670 + }, + { + "epoch": 0.705292993210731, + "grad_norm": 0.03577788919210434, + "learning_rate": 0.0001295421559572503, + "loss": 0.023, + "step": 2675 + }, + { + "epoch": 0.706611297870938, + "grad_norm": 0.5576180815696716, + "learning_rate": 0.00012941021242908034, + "loss": 0.0416, + "step": 2680 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.017131298780441284, + "learning_rate": 0.0001292782689009104, + "loss": 0.0235, + "step": 2685 + }, + { + "epoch": 0.7092479071913519, + "grad_norm": 0.8517888784408569, + "learning_rate": 0.00012914632537274048, + "loss": 0.0168, + "step": 2690 + }, + { + "epoch": 0.7105662118515589, + "grad_norm": 0.23812156915664673, + "learning_rate": 0.00012901438184457052, + "loss": 0.0483, + "step": 2695 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.11746613681316376, + "learning_rate": 0.0001288824383164006, + "loss": 0.0255, + "step": 2700 + }, + { + "epoch": 0.7132028211719729, + "grad_norm": 0.20089928805828094, + "learning_rate": 0.00012875049478823064, + "loss": 0.0267, + "step": 2705 + }, + { + "epoch": 0.7145211258321799, + "grad_norm": 0.8301129937171936, + "learning_rate": 0.0001286185512600607, + "loss": 0.016, + "step": 2710 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.01838674768805504, + "learning_rate": 0.00012848660773189077, + "loss": 0.0229, + "step": 2715 + }, + { + "epoch": 0.7171577351525937, + "grad_norm": 0.03670337051153183, + "learning_rate": 0.00012835466420372082, + "loss": 0.038, + "step": 2720 + }, + { + "epoch": 0.7184760398128007, + "grad_norm": 0.0452633760869503, + "learning_rate": 0.00012822272067555089, + "loss": 0.0622, + "step": 2725 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.09503110498189926, + "learning_rate": 0.00012809077714738093, + "loss": 0.0209, + "step": 2730 + }, 
+ { + "epoch": 0.7211126491332147, + "grad_norm": 1.0327308177947998, + "learning_rate": 0.000127958833619211, + "loss": 0.0361, + "step": 2735 + }, + { + "epoch": 0.7224309537934217, + "grad_norm": 1.0049290657043457, + "learning_rate": 0.00012782689009104104, + "loss": 0.0365, + "step": 2740 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.029774073511362076, + "learning_rate": 0.00012769494656287108, + "loss": 0.0257, + "step": 2745 + }, + { + "epoch": 0.7250675631138356, + "grad_norm": 0.20974040031433105, + "learning_rate": 0.00012756300303470115, + "loss": 0.0542, + "step": 2750 + }, + { + "epoch": 0.7263858677740426, + "grad_norm": 0.8153854608535767, + "learning_rate": 0.0001274310595065312, + "loss": 0.0216, + "step": 2755 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.4393698573112488, + "learning_rate": 0.00012729911597836126, + "loss": 0.0451, + "step": 2760 + }, + { + "epoch": 0.7290224770944566, + "grad_norm": 0.06990349292755127, + "learning_rate": 0.00012716717245019133, + "loss": 0.03, + "step": 2765 + }, + { + "epoch": 0.7303407817546635, + "grad_norm": 0.32689470052719116, + "learning_rate": 0.00012703522892202137, + "loss": 0.0263, + "step": 2770 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.026600876823067665, + "learning_rate": 0.00012690328539385144, + "loss": 0.0404, + "step": 2775 + }, + { + "epoch": 0.7329773910750774, + "grad_norm": 0.11228257417678833, + "learning_rate": 0.00012677134186568148, + "loss": 0.0224, + "step": 2780 + }, + { + "epoch": 0.7342956957352844, + "grad_norm": 0.6469443440437317, + "learning_rate": 0.00012663939833751155, + "loss": 0.0178, + "step": 2785 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.020773250609636307, + "learning_rate": 0.0001265074548093416, + "loss": 0.011, + "step": 2790 + }, + { + "epoch": 0.7369323050556984, + "grad_norm": 0.7378728985786438, + "learning_rate": 0.00012637551128117167, + "loss": 0.0227, + "step": 2795 + }, + { + "epoch": 0.7382506097159054, + "grad_norm": 0.008189595304429531, + "learning_rate": 0.00012624356775300173, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.031633853912353516, + "learning_rate": 0.00012611162422483178, + "loss": 0.0093, + "step": 2805 + }, + { + "epoch": 0.7408872190363193, + "grad_norm": 0.5078475475311279, + "learning_rate": 0.00012597968069666185, + "loss": 0.0567, + "step": 2810 + }, + { + "epoch": 0.7422055236965263, + "grad_norm": 0.21766887605190277, + "learning_rate": 0.0001258477371684919, + "loss": 0.0485, + "step": 2815 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.3029612898826599, + "learning_rate": 0.00012571579364032196, + "loss": 0.032, + "step": 2820 + }, + { + "epoch": 0.7448421330169402, + "grad_norm": 1.2135159969329834, + "learning_rate": 0.00012558385011215203, + "loss": 0.0139, + "step": 2825 + }, + { + "epoch": 0.7461604376771472, + "grad_norm": 0.016875172033905983, + "learning_rate": 0.00012545190658398207, + "loss": 0.0323, + "step": 2830 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.08923230320215225, + "learning_rate": 0.00012531996305581214, + "loss": 0.0343, + "step": 2835 + }, + { + "epoch": 0.7487970469975611, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.00012518801952764215, + "loss": 0.0431, + "step": 2840 + }, + { + "epoch": 0.7501153516577681, + "grad_norm": 0.7344386577606201, + "learning_rate": 0.00012505607599947222, + "loss": 0.0389, + "step": 2845 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.03681635856628418, + 
"learning_rate": 0.0001249241324713023, + "loss": 0.0258, + "step": 2850 + }, + { + "epoch": 0.7527519609781821, + "grad_norm": 0.22866861522197723, + "learning_rate": 0.00012479218894313233, + "loss": 0.0223, + "step": 2855 + }, + { + "epoch": 0.7540702656383891, + "grad_norm": 0.029770435765385628, + "learning_rate": 0.0001246602454149624, + "loss": 0.0205, + "step": 2860 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.011845707893371582, + "learning_rate": 0.00012452830188679244, + "loss": 0.0252, + "step": 2865 + }, + { + "epoch": 0.756706874958803, + "grad_norm": 0.06696149706840515, + "learning_rate": 0.00012439635835862251, + "loss": 0.0166, + "step": 2870 + }, + { + "epoch": 0.75802517961901, + "grad_norm": 0.01653144136071205, + "learning_rate": 0.00012426441483045256, + "loss": 0.0487, + "step": 2875 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.031312476843595505, + "learning_rate": 0.00012413247130228263, + "loss": 0.0155, + "step": 2880 + }, + { + "epoch": 0.7606617889394239, + "grad_norm": 0.011625733226537704, + "learning_rate": 0.0001240005277741127, + "loss": 0.0333, + "step": 2885 + }, + { + "epoch": 0.7619800935996309, + "grad_norm": 0.012089414522051811, + "learning_rate": 0.00012386858424594274, + "loss": 0.003, + "step": 2890 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.3012307584285736, + "learning_rate": 0.0001237366407177728, + "loss": 0.0172, + "step": 2895 + }, + { + "epoch": 0.7646167029200448, + "grad_norm": 0.31575000286102295, + "learning_rate": 0.00012360469718960285, + "loss": 0.0409, + "step": 2900 + }, + { + "epoch": 0.7659350075802518, + "grad_norm": 0.009794364683330059, + "learning_rate": 0.00012347275366143292, + "loss": 0.0214, + "step": 2905 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5973085165023804, + "learning_rate": 0.00012334081013326299, + "loss": 0.0245, + "step": 2910 + }, + { + "epoch": 0.7685716169006658, + "grad_norm": 0.019750040024518967, + "learning_rate": 0.00012320886660509303, + "loss": 0.0063, + "step": 2915 + }, + { + "epoch": 0.7698899215608728, + "grad_norm": 0.06402858346700668, + "learning_rate": 0.0001230769230769231, + "loss": 0.0444, + "step": 2920 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.02876671403646469, + "learning_rate": 0.00012294497954875314, + "loss": 0.0103, + "step": 2925 + }, + { + "epoch": 0.7725265308812866, + "grad_norm": 0.6962207555770874, + "learning_rate": 0.0001228130360205832, + "loss": 0.0318, + "step": 2930 + }, + { + "epoch": 0.7738448355414936, + "grad_norm": 0.006536522414535284, + "learning_rate": 0.00012268109249241325, + "loss": 0.0096, + "step": 2935 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.07097168266773224, + "learning_rate": 0.0001225491489642433, + "loss": 0.0174, + "step": 2940 + }, + { + "epoch": 0.7764814448619076, + "grad_norm": 0.042360126972198486, + "learning_rate": 0.00012241720543607336, + "loss": 0.0158, + "step": 2945 + }, + { + "epoch": 0.7777997495221146, + "grad_norm": 0.01159572321921587, + "learning_rate": 0.0001222852619079034, + "loss": 0.0265, + "step": 2950 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.38408163189888, + "learning_rate": 0.00012215331837973347, + "loss": 0.0233, + "step": 2955 + }, + { + "epoch": 0.7804363588425285, + "grad_norm": 0.15588605403900146, + "learning_rate": 0.00012202137485156353, + "loss": 0.0041, + "step": 2960 + }, + { + "epoch": 0.7817546635027355, + "grad_norm": 0.006892362609505653, + "learning_rate": 0.00012188943132339358, + "loss": 0.0026, + 
"step": 2965 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.030915727838873863, + "learning_rate": 0.00012175748779522364, + "loss": 0.0028, + "step": 2970 + }, + { + "epoch": 0.7843912728231495, + "grad_norm": 0.8151025772094727, + "learning_rate": 0.00012162554426705371, + "loss": 0.0429, + "step": 2975 + }, + { + "epoch": 0.7857095774833565, + "grad_norm": 0.6765475273132324, + "learning_rate": 0.00012149360073888377, + "loss": 0.0319, + "step": 2980 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.054469238966703415, + "learning_rate": 0.00012136165721071382, + "loss": 0.0413, + "step": 2985 + }, + { + "epoch": 0.7883461868037703, + "grad_norm": 0.045610666275024414, + "learning_rate": 0.00012122971368254388, + "loss": 0.0521, + "step": 2990 + }, + { + "epoch": 0.7896644914639773, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.00012109777015437393, + "loss": 0.0846, + "step": 2995 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.0272397268563509, + "learning_rate": 0.00012096582662620399, + "loss": 0.0364, + "step": 3000 + }, + { + "epoch": 0.7909827961241843, + "eval_loss": 0.033312585204839706, + "eval_runtime": 452.2552, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 3000 + }, + { + "epoch": 0.7923011007843913, + "grad_norm": 0.08674059063196182, + "learning_rate": 0.00012083388309803406, + "loss": 0.0081, + "step": 3005 + }, + { + "epoch": 0.7936194054445982, + "grad_norm": 0.21960832178592682, + "learning_rate": 0.00012070193956986411, + "loss": 0.0468, + "step": 3010 + }, + { + "epoch": 0.7949377101048052, + "grad_norm": 0.11259289085865021, + "learning_rate": 0.00012056999604169417, + "loss": 0.0124, + "step": 3015 + }, + { + "epoch": 0.7962560147650122, + "grad_norm": 0.02945362776517868, + "learning_rate": 0.00012043805251352422, + "loss": 0.0298, + "step": 3020 + }, + { + "epoch": 0.7975743194252192, + "grad_norm": 0.27889615297317505, + "learning_rate": 0.00012030610898535428, + "loss": 0.0251, + "step": 3025 + }, + { + "epoch": 0.7988926240854262, + "grad_norm": 0.05873241275548935, + "learning_rate": 0.00012017416545718434, + "loss": 0.0132, + "step": 3030 + }, + { + "epoch": 0.8002109287456332, + "grad_norm": 0.1570046991109848, + "learning_rate": 0.00012004222192901439, + "loss": 0.0228, + "step": 3035 + }, + { + "epoch": 0.80152923340584, + "grad_norm": 0.12575332820415497, + "learning_rate": 0.00011991027840084443, + "loss": 0.0049, + "step": 3040 + }, + { + "epoch": 0.802847538066047, + "grad_norm": 0.8416435122489929, + "learning_rate": 0.00011977833487267449, + "loss": 0.0542, + "step": 3045 + }, + { + "epoch": 0.804165842726254, + "grad_norm": 0.2605098485946655, + "learning_rate": 0.00011964639134450454, + "loss": 0.0084, + "step": 3050 + }, + { + "epoch": 0.805484147386461, + "grad_norm": 0.8996294736862183, + "learning_rate": 0.00011951444781633461, + "loss": 0.0442, + "step": 3055 + }, + { + "epoch": 0.806802452046668, + "grad_norm": 2.7525105476379395, + "learning_rate": 0.00011938250428816467, + "loss": 0.0642, + "step": 3060 + }, + { + "epoch": 0.808120756706875, + "grad_norm": 0.14955930411815643, + "learning_rate": 0.00011925056075999473, + "loss": 0.0384, + "step": 3065 + }, + { + "epoch": 0.8094390613670819, + "grad_norm": 0.018756115809082985, + "learning_rate": 0.00011911861723182478, + "loss": 0.0154, + "step": 3070 + }, + { + "epoch": 0.8107573660272889, + "grad_norm": 0.23998615145683289, + "learning_rate": 0.00011898667370365484, + "loss": 0.0413, + "step": 3075 + }, + { + "epoch": 
0.8120756706874959, + "grad_norm": 0.27253249287605286, + "learning_rate": 0.00011885473017548489, + "loss": 0.0081, + "step": 3080 + }, + { + "epoch": 0.8133939753477029, + "grad_norm": 0.2925993502140045, + "learning_rate": 0.00011872278664731495, + "loss": 0.0332, + "step": 3085 + }, + { + "epoch": 0.8147122800079099, + "grad_norm": 0.5364832878112793, + "learning_rate": 0.00011859084311914502, + "loss": 0.0143, + "step": 3090 + }, + { + "epoch": 0.8160305846681168, + "grad_norm": 0.32104921340942383, + "learning_rate": 0.00011845889959097507, + "loss": 0.0216, + "step": 3095 + }, + { + "epoch": 0.8173488893283237, + "grad_norm": 0.0205856766551733, + "learning_rate": 0.00011832695606280513, + "loss": 0.0346, + "step": 3100 + }, + { + "epoch": 0.8186671939885307, + "grad_norm": 0.2541547417640686, + "learning_rate": 0.00011819501253463518, + "loss": 0.0793, + "step": 3105 + }, + { + "epoch": 0.8199854986487377, + "grad_norm": 0.08333491533994675, + "learning_rate": 0.00011806306900646524, + "loss": 0.0049, + "step": 3110 + }, + { + "epoch": 0.8213038033089447, + "grad_norm": 0.0355968177318573, + "learning_rate": 0.0001179311254782953, + "loss": 0.0051, + "step": 3115 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.06948401033878326, + "learning_rate": 0.00011779918195012536, + "loss": 0.013, + "step": 3120 + }, + { + "epoch": 0.8239404126293587, + "grad_norm": 0.03328891843557358, + "learning_rate": 0.00011766723842195542, + "loss": 0.0122, + "step": 3125 + }, + { + "epoch": 0.8252587172895656, + "grad_norm": 0.013782350346446037, + "learning_rate": 0.00011753529489378548, + "loss": 0.0073, + "step": 3130 + }, + { + "epoch": 0.8265770219497726, + "grad_norm": 0.024390392005443573, + "learning_rate": 0.00011740335136561553, + "loss": 0.0143, + "step": 3135 + }, + { + "epoch": 0.8278953266099796, + "grad_norm": 0.002548128366470337, + "learning_rate": 0.00011727140783744557, + "loss": 0.0027, + "step": 3140 + }, + { + "epoch": 0.8292136312701865, + "grad_norm": 0.11674848943948746, + "learning_rate": 0.00011713946430927563, + "loss": 0.0253, + "step": 3145 + }, + { + "epoch": 0.8305319359303935, + "grad_norm": 0.005774884019047022, + "learning_rate": 0.00011700752078110568, + "loss": 0.0018, + "step": 3150 + }, + { + "epoch": 0.8318502405906005, + "grad_norm": 0.5763069987297058, + "learning_rate": 0.00011687557725293574, + "loss": 0.0119, + "step": 3155 + }, + { + "epoch": 0.8331685452508074, + "grad_norm": 0.0027607593219727278, + "learning_rate": 0.0001167436337247658, + "loss": 0.0279, + "step": 3160 + }, + { + "epoch": 0.8344868499110144, + "grad_norm": 1.859642505645752, + "learning_rate": 0.00011661169019659585, + "loss": 0.0228, + "step": 3165 + }, + { + "epoch": 0.8358051545712214, + "grad_norm": 0.16597022116184235, + "learning_rate": 0.00011647974666842592, + "loss": 0.1228, + "step": 3170 + }, + { + "epoch": 0.8371234592314284, + "grad_norm": 0.33833742141723633, + "learning_rate": 0.00011634780314025598, + "loss": 0.073, + "step": 3175 + }, + { + "epoch": 0.8384417638916354, + "grad_norm": 0.024682912975549698, + "learning_rate": 0.00011621585961208603, + "loss": 0.0042, + "step": 3180 + }, + { + "epoch": 0.8397600685518424, + "grad_norm": 0.05926942452788353, + "learning_rate": 0.00011608391608391609, + "loss": 0.0066, + "step": 3185 + }, + { + "epoch": 0.8410783732120493, + "grad_norm": 0.1414029747247696, + "learning_rate": 0.00011595197255574614, + "loss": 0.0603, + "step": 3190 + }, + { + "epoch": 0.8423966778722562, + "grad_norm": 0.37928736209869385, + 
"learning_rate": 0.0001158200290275762, + "loss": 0.0266, + "step": 3195 + }, + { + "epoch": 0.8437149825324632, + "grad_norm": 0.018329354003071785, + "learning_rate": 0.00011568808549940627, + "loss": 0.0047, + "step": 3200 + }, + { + "epoch": 0.8450332871926702, + "grad_norm": 0.2993735373020172, + "learning_rate": 0.00011555614197123632, + "loss": 0.0218, + "step": 3205 + }, + { + "epoch": 0.8463515918528772, + "grad_norm": 0.1767728328704834, + "learning_rate": 0.00011542419844306638, + "loss": 0.0363, + "step": 3210 + }, + { + "epoch": 0.8476698965130842, + "grad_norm": 0.39774414896965027, + "learning_rate": 0.00011529225491489644, + "loss": 0.0506, + "step": 3215 + }, + { + "epoch": 0.8489882011732911, + "grad_norm": 0.021896762773394585, + "learning_rate": 0.00011516031138672649, + "loss": 0.0081, + "step": 3220 + }, + { + "epoch": 0.8503065058334981, + "grad_norm": 0.358372300863266, + "learning_rate": 0.00011502836785855655, + "loss": 0.0224, + "step": 3225 + }, + { + "epoch": 0.8516248104937051, + "grad_norm": 0.01605542004108429, + "learning_rate": 0.00011489642433038662, + "loss": 0.0215, + "step": 3230 + }, + { + "epoch": 0.8529431151539121, + "grad_norm": 0.021189266815781593, + "learning_rate": 0.00011476448080221667, + "loss": 0.0051, + "step": 3235 + }, + { + "epoch": 0.8542614198141191, + "grad_norm": 0.013394076377153397, + "learning_rate": 0.0001146325372740467, + "loss": 0.021, + "step": 3240 + }, + { + "epoch": 0.8555797244743261, + "grad_norm": 0.19848507642745972, + "learning_rate": 0.00011450059374587676, + "loss": 0.0285, + "step": 3245 + }, + { + "epoch": 0.856898029134533, + "grad_norm": 0.2463046759366989, + "learning_rate": 0.00011436865021770683, + "loss": 0.0384, + "step": 3250 + }, + { + "epoch": 0.8582163337947399, + "grad_norm": 0.37432390451431274, + "learning_rate": 0.00011423670668953688, + "loss": 0.0098, + "step": 3255 + }, + { + "epoch": 0.8595346384549469, + "grad_norm": 0.060943394899368286, + "learning_rate": 0.00011410476316136694, + "loss": 0.0087, + "step": 3260 + }, + { + "epoch": 0.8608529431151539, + "grad_norm": 0.2846696674823761, + "learning_rate": 0.00011397281963319699, + "loss": 0.0148, + "step": 3265 + }, + { + "epoch": 0.8621712477753609, + "grad_norm": 0.009311323054134846, + "learning_rate": 0.00011384087610502705, + "loss": 0.0024, + "step": 3270 + }, + { + "epoch": 0.8634895524355679, + "grad_norm": 0.046277035027742386, + "learning_rate": 0.0001137089325768571, + "loss": 0.0274, + "step": 3275 + }, + { + "epoch": 0.8648078570957748, + "grad_norm": 0.006024620030075312, + "learning_rate": 0.00011357698904868716, + "loss": 0.0286, + "step": 3280 + }, + { + "epoch": 0.8661261617559818, + "grad_norm": 0.033578380942344666, + "learning_rate": 0.00011344504552051723, + "loss": 0.0153, + "step": 3285 + }, + { + "epoch": 0.8674444664161888, + "grad_norm": 0.8537917137145996, + "learning_rate": 0.00011331310199234728, + "loss": 0.0304, + "step": 3290 + }, + { + "epoch": 0.8687627710763958, + "grad_norm": 0.013933337293565273, + "learning_rate": 0.00011318115846417734, + "loss": 0.0112, + "step": 3295 + }, + { + "epoch": 0.8700810757366028, + "grad_norm": 0.35437721014022827, + "learning_rate": 0.0001130492149360074, + "loss": 0.0228, + "step": 3300 + }, + { + "epoch": 0.8713993803968098, + "grad_norm": 1.3024121522903442, + "learning_rate": 0.00011291727140783745, + "loss": 0.0203, + "step": 3305 + }, + { + "epoch": 0.8727176850570166, + "grad_norm": 0.5131255984306335, + "learning_rate": 0.00011278532787966751, + "loss": 0.0181, + 
"step": 3310 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.039366886019706726, + "learning_rate": 0.00011265338435149758, + "loss": 0.0192, + "step": 3315 + }, + { + "epoch": 0.8753542943774306, + "grad_norm": 0.13679669797420502, + "learning_rate": 0.00011252144082332763, + "loss": 0.004, + "step": 3320 + }, + { + "epoch": 0.8766725990376376, + "grad_norm": 0.003076886525377631, + "learning_rate": 0.00011238949729515769, + "loss": 0.0405, + "step": 3325 + }, + { + "epoch": 0.8779909036978446, + "grad_norm": 0.019953785464167595, + "learning_rate": 0.00011225755376698774, + "loss": 0.0241, + "step": 3330 + }, + { + "epoch": 0.8793092083580516, + "grad_norm": 0.007980377413332462, + "learning_rate": 0.0001121256102388178, + "loss": 0.0064, + "step": 3335 + }, + { + "epoch": 0.8806275130182585, + "grad_norm": 0.018761295825242996, + "learning_rate": 0.00011199366671064784, + "loss": 0.0032, + "step": 3340 + }, + { + "epoch": 0.8819458176784655, + "grad_norm": 0.022511709481477737, + "learning_rate": 0.0001118617231824779, + "loss": 0.0055, + "step": 3345 + }, + { + "epoch": 0.8832641223386725, + "grad_norm": 0.021270718425512314, + "learning_rate": 0.00011172977965430795, + "loss": 0.033, + "step": 3350 + }, + { + "epoch": 0.8845824269988795, + "grad_norm": 0.02710561640560627, + "learning_rate": 0.00011159783612613801, + "loss": 0.0094, + "step": 3355 + }, + { + "epoch": 0.8859007316590864, + "grad_norm": 0.4353378117084503, + "learning_rate": 0.00011146589259796806, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.8872190363192934, + "grad_norm": 0.0257766991853714, + "learning_rate": 0.00011133394906979813, + "loss": 0.0059, + "step": 3365 + }, + { + "epoch": 0.8885373409795003, + "grad_norm": 0.80838942527771, + "learning_rate": 0.00011120200554162819, + "loss": 0.0263, + "step": 3370 + }, + { + "epoch": 0.8898556456397073, + "grad_norm": 0.007799761835485697, + "learning_rate": 0.00011107006201345824, + "loss": 0.0028, + "step": 3375 + }, + { + "epoch": 0.8911739502999143, + "grad_norm": 0.007315775845199823, + "learning_rate": 0.0001109381184852883, + "loss": 0.0127, + "step": 3380 + }, + { + "epoch": 0.8924922549601213, + "grad_norm": 1.4861233234405518, + "learning_rate": 0.00011080617495711836, + "loss": 0.0562, + "step": 3385 + }, + { + "epoch": 0.8938105596203283, + "grad_norm": 0.010219530202448368, + "learning_rate": 0.00011067423142894841, + "loss": 0.0438, + "step": 3390 + }, + { + "epoch": 0.8951288642805353, + "grad_norm": 1.0191857814788818, + "learning_rate": 0.00011054228790077848, + "loss": 0.0493, + "step": 3395 + }, + { + "epoch": 0.8964471689407422, + "grad_norm": 0.01459536887705326, + "learning_rate": 0.00011041034437260854, + "loss": 0.0117, + "step": 3400 + }, + { + "epoch": 0.8977654736009492, + "grad_norm": 0.008682495914399624, + "learning_rate": 0.00011027840084443859, + "loss": 0.02, + "step": 3405 + }, + { + "epoch": 0.8990837782611562, + "grad_norm": 0.02197263017296791, + "learning_rate": 0.00011014645731626865, + "loss": 0.0454, + "step": 3410 + }, + { + "epoch": 0.9004020829213631, + "grad_norm": 0.01436714269220829, + "learning_rate": 0.0001100145137880987, + "loss": 0.0283, + "step": 3415 + }, + { + "epoch": 0.9017203875815701, + "grad_norm": 0.14327946305274963, + "learning_rate": 0.00010988257025992876, + "loss": 0.0461, + "step": 3420 + }, + { + "epoch": 0.9030386922417771, + "grad_norm": 1.671773910522461, + "learning_rate": 0.00010975062673175883, + "loss": 0.054, + "step": 3425 + }, + { + "epoch": 0.904356996901984, + 
"grad_norm": 0.009926804341375828, + "learning_rate": 0.00010961868320358888, + "loss": 0.0429, + "step": 3430 + }, + { + "epoch": 0.905675301562191, + "grad_norm": 0.554020881652832, + "learning_rate": 0.00010948673967541894, + "loss": 0.0618, + "step": 3435 + }, + { + "epoch": 0.906993606222398, + "grad_norm": 0.1399248093366623, + "learning_rate": 0.00010935479614724897, + "loss": 0.0229, + "step": 3440 + }, + { + "epoch": 0.908311910882605, + "grad_norm": 0.02739197015762329, + "learning_rate": 0.00010922285261907904, + "loss": 0.0082, + "step": 3445 + }, + { + "epoch": 0.909630215542812, + "grad_norm": 0.33394527435302734, + "learning_rate": 0.00010909090909090909, + "loss": 0.0403, + "step": 3450 + }, + { + "epoch": 0.9109485202030189, + "grad_norm": 0.08083894103765488, + "learning_rate": 0.00010895896556273915, + "loss": 0.0406, + "step": 3455 + }, + { + "epoch": 0.9122668248632259, + "grad_norm": 0.39336663484573364, + "learning_rate": 0.0001088270220345692, + "loss": 0.02, + "step": 3460 + }, + { + "epoch": 0.9135851295234328, + "grad_norm": 0.20481553673744202, + "learning_rate": 0.00010869507850639926, + "loss": 0.0221, + "step": 3465 + }, + { + "epoch": 0.9149034341836398, + "grad_norm": 1.4507408142089844, + "learning_rate": 0.00010856313497822932, + "loss": 0.0357, + "step": 3470 + }, + { + "epoch": 0.9162217388438468, + "grad_norm": 0.2678806483745575, + "learning_rate": 0.00010843119145005937, + "loss": 0.0181, + "step": 3475 + }, + { + "epoch": 0.9175400435040538, + "grad_norm": 0.007361674215644598, + "learning_rate": 0.00010829924792188944, + "loss": 0.0978, + "step": 3480 + }, + { + "epoch": 0.9188583481642607, + "grad_norm": 0.773695707321167, + "learning_rate": 0.0001081673043937195, + "loss": 0.0401, + "step": 3485 + }, + { + "epoch": 0.9201766528244677, + "grad_norm": 0.0010772625682875514, + "learning_rate": 0.00010803536086554955, + "loss": 0.0233, + "step": 3490 + }, + { + "epoch": 0.9214949574846747, + "grad_norm": 0.08971104770898819, + "learning_rate": 0.00010790341733737961, + "loss": 0.0319, + "step": 3495 + }, + { + "epoch": 0.9228132621448817, + "grad_norm": 0.21372731029987335, + "learning_rate": 0.00010777147380920966, + "loss": 0.0315, + "step": 3500 + }, + { + "epoch": 0.9228132621448817, + "eval_loss": 0.02952708676457405, + "eval_runtime": 451.5837, + "eval_samples_per_second": 7.467, + "eval_steps_per_second": 3.734, + "step": 3500 + }, + { + "epoch": 0.9241315668050887, + "grad_norm": 0.016639264300465584, + "learning_rate": 0.00010763953028103972, + "loss": 0.0125, + "step": 3505 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.46340492367744446, + "learning_rate": 0.00010750758675286979, + "loss": 0.0186, + "step": 3510 + }, + { + "epoch": 0.9267681761255026, + "grad_norm": 0.01847526989877224, + "learning_rate": 0.00010737564322469984, + "loss": 0.0026, + "step": 3515 + }, + { + "epoch": 0.9280864807857095, + "grad_norm": 0.5947860479354858, + "learning_rate": 0.0001072436996965299, + "loss": 0.0259, + "step": 3520 + }, + { + "epoch": 0.9294047854459165, + "grad_norm": 0.06145291402935982, + "learning_rate": 0.00010711175616835995, + "loss": 0.0057, + "step": 3525 + }, + { + "epoch": 0.9307230901061235, + "grad_norm": 0.0143959429115057, + "learning_rate": 0.00010697981264019001, + "loss": 0.0145, + "step": 3530 + }, + { + "epoch": 0.9320413947663305, + "grad_norm": 0.21143831312656403, + "learning_rate": 0.00010684786911202007, + "loss": 0.0459, + "step": 3535 + }, + { + "epoch": 0.9333596994265375, + "grad_norm": 0.02548077143728733, 
+ "learning_rate": 0.00010671592558385011, + "loss": 0.0051, + "step": 3540 + }, + { + "epoch": 0.9346780040867444, + "grad_norm": 0.008077048696577549, + "learning_rate": 0.00010658398205568016, + "loss": 0.0306, + "step": 3545 + }, + { + "epoch": 0.9359963087469514, + "grad_norm": 0.0030760422814637423, + "learning_rate": 0.00010645203852751022, + "loss": 0.0575, + "step": 3550 + }, + { + "epoch": 0.9373146134071584, + "grad_norm": 0.18114158511161804, + "learning_rate": 0.00010632009499934027, + "loss": 0.0885, + "step": 3555 + }, + { + "epoch": 0.9386329180673654, + "grad_norm": 0.02450549602508545, + "learning_rate": 0.00010618815147117034, + "loss": 0.0045, + "step": 3560 + }, + { + "epoch": 0.9399512227275724, + "grad_norm": 0.1238626018166542, + "learning_rate": 0.0001060562079430004, + "loss": 0.0166, + "step": 3565 + }, + { + "epoch": 0.9412695273877794, + "grad_norm": 0.1879919469356537, + "learning_rate": 0.00010592426441483046, + "loss": 0.0077, + "step": 3570 + }, + { + "epoch": 0.9425878320479862, + "grad_norm": 0.11323565989732742, + "learning_rate": 0.00010579232088666051, + "loss": 0.0213, + "step": 3575 + }, + { + "epoch": 0.9439061367081932, + "grad_norm": 0.35575854778289795, + "learning_rate": 0.00010566037735849057, + "loss": 0.0336, + "step": 3580 + }, + { + "epoch": 0.9452244413684002, + "grad_norm": 0.14052227139472961, + "learning_rate": 0.00010552843383032062, + "loss": 0.0325, + "step": 3585 + }, + { + "epoch": 0.9465427460286072, + "grad_norm": 0.2643798887729645, + "learning_rate": 0.00010539649030215069, + "loss": 0.0192, + "step": 3590 + }, + { + "epoch": 0.9478610506888142, + "grad_norm": 0.3207031190395355, + "learning_rate": 0.00010526454677398075, + "loss": 0.0221, + "step": 3595 + }, + { + "epoch": 0.9491793553490212, + "grad_norm": 0.022803861647844315, + "learning_rate": 0.0001051326032458108, + "loss": 0.029, + "step": 3600 + }, + { + "epoch": 0.9504976600092281, + "grad_norm": 0.02511664852499962, + "learning_rate": 0.00010500065971764086, + "loss": 0.0422, + "step": 3605 + }, + { + "epoch": 0.9518159646694351, + "grad_norm": 0.06505445390939713, + "learning_rate": 0.00010486871618947091, + "loss": 0.0092, + "step": 3610 + }, + { + "epoch": 0.9531342693296421, + "grad_norm": 0.09998584538698196, + "learning_rate": 0.00010473677266130097, + "loss": 0.0242, + "step": 3615 + }, + { + "epoch": 0.9544525739898491, + "grad_norm": 0.9645698666572571, + "learning_rate": 0.00010460482913313104, + "loss": 0.0124, + "step": 3620 + }, + { + "epoch": 0.955770878650056, + "grad_norm": 0.2389964610338211, + "learning_rate": 0.0001044728856049611, + "loss": 0.0169, + "step": 3625 + }, + { + "epoch": 0.957089183310263, + "grad_norm": 2.030608654022217, + "learning_rate": 0.00010434094207679115, + "loss": 0.0518, + "step": 3630 + }, + { + "epoch": 0.9584074879704699, + "grad_norm": 0.05979987606406212, + "learning_rate": 0.0001042089985486212, + "loss": 0.0081, + "step": 3635 + }, + { + "epoch": 0.9597257926306769, + "grad_norm": 0.15761719644069672, + "learning_rate": 0.00010407705502045125, + "loss": 0.0061, + "step": 3640 + }, + { + "epoch": 0.9610440972908839, + "grad_norm": 0.6534290909767151, + "learning_rate": 0.0001039451114922813, + "loss": 0.0104, + "step": 3645 + }, + { + "epoch": 0.9623624019510909, + "grad_norm": 1.0324147939682007, + "learning_rate": 0.00010381316796411136, + "loss": 0.0381, + "step": 3650 + }, + { + "epoch": 0.9636807066112979, + "grad_norm": 0.002968872431665659, + "learning_rate": 0.00010368122443594142, + "loss": 0.0343, + "step": 
3655 + }, + { + "epoch": 0.9649990112715049, + "grad_norm": 0.011243184097111225, + "learning_rate": 0.00010354928090777147, + "loss": 0.019, + "step": 3660 + }, + { + "epoch": 0.9663173159317118, + "grad_norm": 0.17663739621639252, + "learning_rate": 0.00010341733737960153, + "loss": 0.0452, + "step": 3665 + }, + { + "epoch": 0.9676356205919188, + "grad_norm": 1.2647719383239746, + "learning_rate": 0.00010328539385143158, + "loss": 0.0154, + "step": 3670 + }, + { + "epoch": 0.9689539252521258, + "grad_norm": 0.3691752552986145, + "learning_rate": 0.00010315345032326165, + "loss": 0.028, + "step": 3675 + }, + { + "epoch": 0.9702722299123328, + "grad_norm": 0.0015879774000495672, + "learning_rate": 0.00010302150679509171, + "loss": 0.0202, + "step": 3680 + }, + { + "epoch": 0.9715905345725397, + "grad_norm": 0.1441984623670578, + "learning_rate": 0.00010288956326692176, + "loss": 0.0221, + "step": 3685 + }, + { + "epoch": 0.9729088392327467, + "grad_norm": 0.20431455969810486, + "learning_rate": 0.00010275761973875182, + "loss": 0.0072, + "step": 3690 + }, + { + "epoch": 0.9742271438929536, + "grad_norm": 0.861625611782074, + "learning_rate": 0.00010262567621058187, + "loss": 0.0523, + "step": 3695 + }, + { + "epoch": 0.9755454485531606, + "grad_norm": 0.005049478262662888, + "learning_rate": 0.00010249373268241193, + "loss": 0.0051, + "step": 3700 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.49685510993003845, + "learning_rate": 0.000102361789154242, + "loss": 0.023, + "step": 3705 + }, + { + "epoch": 0.9781820578735746, + "grad_norm": 0.08789395540952682, + "learning_rate": 0.00010222984562607205, + "loss": 0.0159, + "step": 3710 + }, + { + "epoch": 0.9795003625337816, + "grad_norm": 0.027168691158294678, + "learning_rate": 0.00010209790209790211, + "loss": 0.0083, + "step": 3715 + }, + { + "epoch": 0.9808186671939886, + "grad_norm": 0.0006773864733986557, + "learning_rate": 0.00010196595856973217, + "loss": 0.0048, + "step": 3720 + }, + { + "epoch": 0.9821369718541955, + "grad_norm": 0.01636457070708275, + "learning_rate": 0.00010183401504156222, + "loss": 0.0159, + "step": 3725 + }, + { + "epoch": 0.9834552765144025, + "grad_norm": 0.10160859674215317, + "learning_rate": 0.00010170207151339228, + "loss": 0.0047, + "step": 3730 + }, + { + "epoch": 0.9847735811746094, + "grad_norm": 0.14173269271850586, + "learning_rate": 0.00010157012798522232, + "loss": 0.006, + "step": 3735 + }, + { + "epoch": 0.9860918858348164, + "grad_norm": 0.003458512481302023, + "learning_rate": 0.00010143818445705238, + "loss": 0.0193, + "step": 3740 + }, + { + "epoch": 0.9874101904950234, + "grad_norm": 0.005163820460438728, + "learning_rate": 0.00010130624092888243, + "loss": 0.0039, + "step": 3745 + }, + { + "epoch": 0.9887284951552304, + "grad_norm": 0.005913791712373495, + "learning_rate": 0.00010117429740071249, + "loss": 0.0119, + "step": 3750 + }, + { + "epoch": 0.9900467998154373, + "grad_norm": 0.00800853967666626, + "learning_rate": 0.00010104235387254256, + "loss": 0.044, + "step": 3755 + }, + { + "epoch": 0.9913651044756443, + "grad_norm": 0.18146778643131256, + "learning_rate": 0.00010091041034437261, + "loss": 0.0048, + "step": 3760 + }, + { + "epoch": 0.9926834091358513, + "grad_norm": 0.01235104724764824, + "learning_rate": 0.00010077846681620267, + "loss": 0.0017, + "step": 3765 + }, + { + "epoch": 0.9940017137960583, + "grad_norm": 0.17677897214889526, + "learning_rate": 0.00010064652328803272, + "loss": 0.0339, + "step": 3770 + }, + { + "epoch": 0.9953200184562653, + "grad_norm": 
0.0017472271574661136, + "learning_rate": 0.00010051457975986278, + "loss": 0.0494, + "step": 3775 + }, + { + "epoch": 0.9966383231164723, + "grad_norm": 0.10814860463142395, + "learning_rate": 0.00010038263623169283, + "loss": 0.0741, + "step": 3780 + }, + { + "epoch": 0.9979566277766792, + "grad_norm": 0.11329760402441025, + "learning_rate": 0.0001002506927035229, + "loss": 0.0182, + "step": 3785 + }, + { + "epoch": 0.9992749324368861, + "grad_norm": 0.11573276668787003, + "learning_rate": 0.00010011874917535296, + "loss": 0.0068, + "step": 3790 + }, + { + "epoch": 1.000790982796124, + "grad_norm": 0.08449886739253998, + "learning_rate": 9.998680564718301e-05, + "loss": 0.0141, + "step": 3795 + }, + { + "epoch": 1.002109287456331, + "grad_norm": 0.05035184696316719, + "learning_rate": 9.985486211901307e-05, + "loss": 0.0293, + "step": 3800 + }, + { + "epoch": 1.003427592116538, + "grad_norm": 0.0255444198846817, + "learning_rate": 9.972291859084313e-05, + "loss": 0.0054, + "step": 3805 + }, + { + "epoch": 1.004745896776745, + "grad_norm": 0.0033677336759865284, + "learning_rate": 9.959097506267318e-05, + "loss": 0.0567, + "step": 3810 + }, + { + "epoch": 1.006064201436952, + "grad_norm": 0.09453682601451874, + "learning_rate": 9.945903153450324e-05, + "loss": 0.0589, + "step": 3815 + }, + { + "epoch": 1.007382506097159, + "grad_norm": 0.01592979207634926, + "learning_rate": 9.932708800633329e-05, + "loss": 0.0043, + "step": 3820 + }, + { + "epoch": 1.008700810757366, + "grad_norm": 0.002263693604618311, + "learning_rate": 9.919514447816335e-05, + "loss": 0.0195, + "step": 3825 + }, + { + "epoch": 1.010019115417573, + "grad_norm": 0.013390793465077877, + "learning_rate": 9.90632009499934e-05, + "loss": 0.0152, + "step": 3830 + }, + { + "epoch": 1.01133742007778, + "grad_norm": 0.10473847389221191, + "learning_rate": 9.893125742182346e-05, + "loss": 0.0606, + "step": 3835 + }, + { + "epoch": 1.012655724737987, + "grad_norm": 0.05837221071124077, + "learning_rate": 9.879931389365353e-05, + "loss": 0.0121, + "step": 3840 + }, + { + "epoch": 1.013974029398194, + "grad_norm": 0.3803791105747223, + "learning_rate": 9.866737036548358e-05, + "loss": 0.0386, + "step": 3845 + }, + { + "epoch": 1.0152923340584008, + "grad_norm": 0.4067519009113312, + "learning_rate": 9.853542683731364e-05, + "loss": 0.0115, + "step": 3850 + }, + { + "epoch": 1.0166106387186078, + "grad_norm": 0.02585229091346264, + "learning_rate": 9.84034833091437e-05, + "loss": 0.0214, + "step": 3855 + }, + { + "epoch": 1.0179289433788148, + "grad_norm": 0.03670825809240341, + "learning_rate": 9.827153978097374e-05, + "loss": 0.0059, + "step": 3860 + }, + { + "epoch": 1.0192472480390218, + "grad_norm": 0.014171554706990719, + "learning_rate": 9.81395962528038e-05, + "loss": 0.0145, + "step": 3865 + }, + { + "epoch": 1.0205655526992288, + "grad_norm": 0.027376385405659676, + "learning_rate": 9.800765272463386e-05, + "loss": 0.0089, + "step": 3870 + }, + { + "epoch": 1.0218838573594358, + "grad_norm": 0.03168405964970589, + "learning_rate": 9.787570919646392e-05, + "loss": 0.0132, + "step": 3875 + }, + { + "epoch": 1.0232021620196428, + "grad_norm": 0.03346199914813042, + "learning_rate": 9.774376566829397e-05, + "loss": 0.0246, + "step": 3880 + }, + { + "epoch": 1.0245204666798498, + "grad_norm": 0.00894144270569086, + "learning_rate": 9.761182214012403e-05, + "loss": 0.0105, + "step": 3885 + }, + { + "epoch": 1.0258387713400567, + "grad_norm": 0.3172806203365326, + "learning_rate": 9.747987861195409e-05, + "loss": 0.0103, + 
"step": 3890 + }, + { + "epoch": 1.0271570760002637, + "grad_norm": 0.009055040776729584, + "learning_rate": 9.734793508378414e-05, + "loss": 0.0103, + "step": 3895 + }, + { + "epoch": 1.0284753806604707, + "grad_norm": 0.014140011742711067, + "learning_rate": 9.721599155561421e-05, + "loss": 0.0037, + "step": 3900 + }, + { + "epoch": 1.0297936853206777, + "grad_norm": 0.008317383006215096, + "learning_rate": 9.708404802744427e-05, + "loss": 0.002, + "step": 3905 + }, + { + "epoch": 1.0311119899808845, + "grad_norm": 0.005038558971136808, + "learning_rate": 9.695210449927431e-05, + "loss": 0.0017, + "step": 3910 + }, + { + "epoch": 1.0324302946410915, + "grad_norm": 0.40058520436286926, + "learning_rate": 9.682016097110436e-05, + "loss": 0.0065, + "step": 3915 + }, + { + "epoch": 1.0337485993012985, + "grad_norm": 0.005197151098400354, + "learning_rate": 9.668821744293442e-05, + "loss": 0.0031, + "step": 3920 + }, + { + "epoch": 1.0350669039615055, + "grad_norm": 0.014353781007230282, + "learning_rate": 9.655627391476449e-05, + "loss": 0.0009, + "step": 3925 + }, + { + "epoch": 1.0363852086217125, + "grad_norm": 0.13260559737682343, + "learning_rate": 9.642433038659454e-05, + "loss": 0.0323, + "step": 3930 + }, + { + "epoch": 1.0377035132819195, + "grad_norm": 0.006795065477490425, + "learning_rate": 9.62923868584246e-05, + "loss": 0.0022, + "step": 3935 + }, + { + "epoch": 1.0390218179421264, + "grad_norm": 0.2276086062192917, + "learning_rate": 9.616044333025466e-05, + "loss": 0.0221, + "step": 3940 + }, + { + "epoch": 1.0403401226023334, + "grad_norm": 0.06121920794248581, + "learning_rate": 9.602849980208471e-05, + "loss": 0.0037, + "step": 3945 + }, + { + "epoch": 1.0416584272625404, + "grad_norm": 0.9180755019187927, + "learning_rate": 9.589655627391477e-05, + "loss": 0.0589, + "step": 3950 + }, + { + "epoch": 1.0429767319227474, + "grad_norm": 0.07515591382980347, + "learning_rate": 9.576461274574484e-05, + "loss": 0.0653, + "step": 3955 + }, + { + "epoch": 1.0442950365829544, + "grad_norm": 0.018060607835650444, + "learning_rate": 9.563266921757488e-05, + "loss": 0.0178, + "step": 3960 + }, + { + "epoch": 1.0456133412431612, + "grad_norm": 0.02751368284225464, + "learning_rate": 9.550072568940493e-05, + "loss": 0.0076, + "step": 3965 + }, + { + "epoch": 1.0469316459033682, + "grad_norm": 0.653998613357544, + "learning_rate": 9.536878216123499e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 1.0482499505635752, + "grad_norm": 0.3117768168449402, + "learning_rate": 9.523683863306505e-05, + "loss": 0.0087, + "step": 3975 + }, + { + "epoch": 1.0495682552237822, + "grad_norm": 0.013952831737697124, + "learning_rate": 9.510489510489511e-05, + "loss": 0.0037, + "step": 3980 + }, + { + "epoch": 1.0508865598839892, + "grad_norm": 0.01806250400841236, + "learning_rate": 9.497295157672517e-05, + "loss": 0.0028, + "step": 3985 + }, + { + "epoch": 1.0522048645441962, + "grad_norm": 0.13678006827831268, + "learning_rate": 9.484100804855523e-05, + "loss": 0.0533, + "step": 3990 + }, + { + "epoch": 1.0535231692044031, + "grad_norm": 0.14869382977485657, + "learning_rate": 9.470906452038528e-05, + "loss": 0.009, + "step": 3995 + }, + { + "epoch": 1.0548414738646101, + "grad_norm": 0.33614659309387207, + "learning_rate": 9.457712099221534e-05, + "loss": 0.0555, + "step": 4000 + }, + { + "epoch": 1.0548414738646101, + "eval_loss": 0.026165226474404335, + "eval_runtime": 452.2482, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 4000 + }, + { + "epoch": 
1.0561597785248171, + "grad_norm": 0.007546027656644583, + "learning_rate": 9.444517746404539e-05, + "loss": 0.0029, + "step": 4005 + }, + { + "epoch": 1.0574780831850241, + "grad_norm": 0.3720332384109497, + "learning_rate": 9.431323393587545e-05, + "loss": 0.0353, + "step": 4010 + }, + { + "epoch": 1.0587963878452311, + "grad_norm": 1.1335264444351196, + "learning_rate": 9.41812904077055e-05, + "loss": 0.0142, + "step": 4015 + }, + { + "epoch": 1.060114692505438, + "grad_norm": 0.024723488837480545, + "learning_rate": 9.404934687953556e-05, + "loss": 0.006, + "step": 4020 + }, + { + "epoch": 1.0614329971656449, + "grad_norm": 0.040354058146476746, + "learning_rate": 9.391740335136562e-05, + "loss": 0.0107, + "step": 4025 + }, + { + "epoch": 1.0627513018258519, + "grad_norm": 0.222810298204422, + "learning_rate": 9.378545982319567e-05, + "loss": 0.0273, + "step": 4030 + }, + { + "epoch": 1.0640696064860589, + "grad_norm": 0.025684095919132233, + "learning_rate": 9.365351629502574e-05, + "loss": 0.0033, + "step": 4035 + }, + { + "epoch": 1.0653879111462659, + "grad_norm": 0.05338352546095848, + "learning_rate": 9.35215727668558e-05, + "loss": 0.0052, + "step": 4040 + }, + { + "epoch": 1.0667062158064728, + "grad_norm": 0.06182330474257469, + "learning_rate": 9.338962923868585e-05, + "loss": 0.0038, + "step": 4045 + }, + { + "epoch": 1.0680245204666798, + "grad_norm": 0.012170832604169846, + "learning_rate": 9.325768571051591e-05, + "loss": 0.0018, + "step": 4050 + }, + { + "epoch": 1.0693428251268868, + "grad_norm": 0.5424306392669678, + "learning_rate": 9.312574218234596e-05, + "loss": 0.0445, + "step": 4055 + }, + { + "epoch": 1.0706611297870938, + "grad_norm": 0.017939254641532898, + "learning_rate": 9.299379865417602e-05, + "loss": 0.0389, + "step": 4060 + }, + { + "epoch": 1.0719794344473008, + "grad_norm": 0.0060431682504713535, + "learning_rate": 9.286185512600607e-05, + "loss": 0.0025, + "step": 4065 + }, + { + "epoch": 1.0732977391075078, + "grad_norm": 0.0071444883942604065, + "learning_rate": 9.272991159783613e-05, + "loss": 0.0333, + "step": 4070 + }, + { + "epoch": 1.0746160437677148, + "grad_norm": 0.29632750153541565, + "learning_rate": 9.259796806966619e-05, + "loss": 0.0151, + "step": 4075 + }, + { + "epoch": 1.0759343484279218, + "grad_norm": 0.004526323173195124, + "learning_rate": 9.246602454149624e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 1.0772526530881286, + "grad_norm": 0.023945212364196777, + "learning_rate": 9.23340810133263e-05, + "loss": 0.004, + "step": 4085 + }, + { + "epoch": 1.0785709577483356, + "grad_norm": 0.13235126435756683, + "learning_rate": 9.220213748515635e-05, + "loss": 0.0059, + "step": 4090 + }, + { + "epoch": 1.0798892624085425, + "grad_norm": 0.17592330276966095, + "learning_rate": 9.207019395698642e-05, + "loss": 0.0302, + "step": 4095 + }, + { + "epoch": 1.0812075670687495, + "grad_norm": 0.004582866560667753, + "learning_rate": 9.193825042881648e-05, + "loss": 0.009, + "step": 4100 + }, + { + "epoch": 1.0825258717289565, + "grad_norm": 0.15214525163173676, + "learning_rate": 9.180630690064653e-05, + "loss": 0.0062, + "step": 4105 + }, + { + "epoch": 1.0838441763891635, + "grad_norm": 0.16535983979701996, + "learning_rate": 9.167436337247658e-05, + "loss": 0.0926, + "step": 4110 + }, + { + "epoch": 1.0851624810493705, + "grad_norm": 0.013285227119922638, + "learning_rate": 9.154241984430663e-05, + "loss": 0.0043, + "step": 4115 + }, + { + "epoch": 1.0864807857095775, + "grad_norm": 0.012116984464228153, + "learning_rate": 
9.14104763161367e-05, + "loss": 0.0037, + "step": 4120 + }, + { + "epoch": 1.0877990903697845, + "grad_norm": 0.0373845212161541, + "learning_rate": 9.127853278796676e-05, + "loss": 0.0081, + "step": 4125 + }, + { + "epoch": 1.0891173950299915, + "grad_norm": 0.09324615448713303, + "learning_rate": 9.114658925979681e-05, + "loss": 0.0534, + "step": 4130 + }, + { + "epoch": 1.0904356996901985, + "grad_norm": 0.010992968454957008, + "learning_rate": 9.101464573162687e-05, + "loss": 0.0025, + "step": 4135 + }, + { + "epoch": 1.0917540043504055, + "grad_norm": 0.13710318505764008, + "learning_rate": 9.088270220345692e-05, + "loss": 0.0555, + "step": 4140 + }, + { + "epoch": 1.0930723090106123, + "grad_norm": 0.010403074324131012, + "learning_rate": 9.075075867528698e-05, + "loss": 0.0042, + "step": 4145 + }, + { + "epoch": 1.0943906136708192, + "grad_norm": 0.21544460952281952, + "learning_rate": 9.061881514711705e-05, + "loss": 0.0144, + "step": 4150 + }, + { + "epoch": 1.0957089183310262, + "grad_norm": 0.04194799065589905, + "learning_rate": 9.04868716189471e-05, + "loss": 0.0106, + "step": 4155 + }, + { + "epoch": 1.0970272229912332, + "grad_norm": 0.029204202815890312, + "learning_rate": 9.035492809077715e-05, + "loss": 0.0085, + "step": 4160 + }, + { + "epoch": 1.0983455276514402, + "grad_norm": 0.006751026958227158, + "learning_rate": 9.02229845626072e-05, + "loss": 0.0049, + "step": 4165 + }, + { + "epoch": 1.0996638323116472, + "grad_norm": 0.008232722990214825, + "learning_rate": 9.009104103443726e-05, + "loss": 0.0172, + "step": 4170 + }, + { + "epoch": 1.1009821369718542, + "grad_norm": 0.05630079656839371, + "learning_rate": 8.995909750626733e-05, + "loss": 0.0112, + "step": 4175 + }, + { + "epoch": 1.1023004416320612, + "grad_norm": 0.0011601662263274193, + "learning_rate": 8.982715397809738e-05, + "loss": 0.0317, + "step": 4180 + }, + { + "epoch": 1.1036187462922682, + "grad_norm": 0.006554402410984039, + "learning_rate": 8.969521044992744e-05, + "loss": 0.0035, + "step": 4185 + }, + { + "epoch": 1.1049370509524752, + "grad_norm": 0.34513652324676514, + "learning_rate": 8.956326692175749e-05, + "loss": 0.0036, + "step": 4190 + }, + { + "epoch": 1.1062553556126822, + "grad_norm": 0.283669650554657, + "learning_rate": 8.943132339358755e-05, + "loss": 0.0182, + "step": 4195 + }, + { + "epoch": 1.1075736602728892, + "grad_norm": 0.5376952290534973, + "learning_rate": 8.92993798654176e-05, + "loss": 0.0293, + "step": 4200 + }, + { + "epoch": 1.108891964933096, + "grad_norm": 0.01689724065363407, + "learning_rate": 8.916743633724767e-05, + "loss": 0.0206, + "step": 4205 + }, + { + "epoch": 1.110210269593303, + "grad_norm": 0.026538770645856857, + "learning_rate": 8.903549280907772e-05, + "loss": 0.0181, + "step": 4210 + }, + { + "epoch": 1.11152857425351, + "grad_norm": 0.6372873783111572, + "learning_rate": 8.890354928090777e-05, + "loss": 0.021, + "step": 4215 + }, + { + "epoch": 1.112846878913717, + "grad_norm": 0.06177428737282753, + "learning_rate": 8.877160575273783e-05, + "loss": 0.0033, + "step": 4220 + }, + { + "epoch": 1.114165183573924, + "grad_norm": 0.3712109923362732, + "learning_rate": 8.863966222456788e-05, + "loss": 0.0075, + "step": 4225 + }, + { + "epoch": 1.115483488234131, + "grad_norm": 0.030514653772115707, + "learning_rate": 8.850771869639795e-05, + "loss": 0.0183, + "step": 4230 + }, + { + "epoch": 1.116801792894338, + "grad_norm": 0.012861707247793674, + "learning_rate": 8.837577516822801e-05, + "loss": 0.0032, + "step": 4235 + }, + { + "epoch": 
1.118120097554545, + "grad_norm": 0.3278522789478302, + "learning_rate": 8.824383164005806e-05, + "loss": 0.0058, + "step": 4240 + }, + { + "epoch": 1.1194384022147519, + "grad_norm": 0.580259382724762, + "learning_rate": 8.811188811188812e-05, + "loss": 0.0068, + "step": 4245 + }, + { + "epoch": 1.1207567068749589, + "grad_norm": 0.007002575788646936, + "learning_rate": 8.797994458371817e-05, + "loss": 0.0063, + "step": 4250 + }, + { + "epoch": 1.1220750115351659, + "grad_norm": 0.22484643757343292, + "learning_rate": 8.784800105554823e-05, + "loss": 0.0167, + "step": 4255 + }, + { + "epoch": 1.1233933161953726, + "grad_norm": 0.004122686106711626, + "learning_rate": 8.771605752737829e-05, + "loss": 0.002, + "step": 4260 + }, + { + "epoch": 1.1247116208555796, + "grad_norm": 0.009832561016082764, + "learning_rate": 8.758411399920834e-05, + "loss": 0.0029, + "step": 4265 + }, + { + "epoch": 1.1260299255157866, + "grad_norm": 0.04854527860879898, + "learning_rate": 8.74521704710384e-05, + "loss": 0.0068, + "step": 4270 + }, + { + "epoch": 1.1273482301759936, + "grad_norm": 0.12221235036849976, + "learning_rate": 8.732022694286845e-05, + "loss": 0.003, + "step": 4275 + }, + { + "epoch": 1.1286665348362006, + "grad_norm": 0.005857539363205433, + "learning_rate": 8.718828341469851e-05, + "loss": 0.0022, + "step": 4280 + }, + { + "epoch": 1.1299848394964076, + "grad_norm": 0.10582758486270905, + "learning_rate": 8.705633988652856e-05, + "loss": 0.002, + "step": 4285 + }, + { + "epoch": 1.1313031441566146, + "grad_norm": 0.006190940272063017, + "learning_rate": 8.692439635835863e-05, + "loss": 0.0022, + "step": 4290 + }, + { + "epoch": 1.1326214488168216, + "grad_norm": 0.00221514655277133, + "learning_rate": 8.679245283018869e-05, + "loss": 0.0314, + "step": 4295 + }, + { + "epoch": 1.1339397534770286, + "grad_norm": 0.0796755850315094, + "learning_rate": 8.666050930201874e-05, + "loss": 0.0347, + "step": 4300 + }, + { + "epoch": 1.1352580581372356, + "grad_norm": 0.20088806748390198, + "learning_rate": 8.65285657738488e-05, + "loss": 0.0048, + "step": 4305 + }, + { + "epoch": 1.1365763627974426, + "grad_norm": 0.4018377363681793, + "learning_rate": 8.639662224567884e-05, + "loss": 0.0234, + "step": 4310 + }, + { + "epoch": 1.1378946674576496, + "grad_norm": 0.014961684122681618, + "learning_rate": 8.626467871750891e-05, + "loss": 0.0033, + "step": 4315 + }, + { + "epoch": 1.1392129721178565, + "grad_norm": 0.004534922540187836, + "learning_rate": 8.613273518933897e-05, + "loss": 0.0021, + "step": 4320 + }, + { + "epoch": 1.1405312767780633, + "grad_norm": 0.06340984255075455, + "learning_rate": 8.600079166116902e-05, + "loss": 0.0538, + "step": 4325 + }, + { + "epoch": 1.1418495814382703, + "grad_norm": 0.007374623324722052, + "learning_rate": 8.586884813299908e-05, + "loss": 0.0157, + "step": 4330 + }, + { + "epoch": 1.1431678860984773, + "grad_norm": 0.02313193492591381, + "learning_rate": 8.573690460482913e-05, + "loss": 0.0307, + "step": 4335 + }, + { + "epoch": 1.1444861907586843, + "grad_norm": 0.014071634039282799, + "learning_rate": 8.560496107665919e-05, + "loss": 0.0058, + "step": 4340 + }, + { + "epoch": 1.1458044954188913, + "grad_norm": 1.4664901494979858, + "learning_rate": 8.547301754848926e-05, + "loss": 0.0566, + "step": 4345 + }, + { + "epoch": 1.1471228000790983, + "grad_norm": 0.023680074140429497, + "learning_rate": 8.534107402031931e-05, + "loss": 0.0048, + "step": 4350 + }, + { + "epoch": 1.1484411047393053, + "grad_norm": 0.012555698864161968, + "learning_rate": 
8.520913049214937e-05, + "loss": 0.0076, + "step": 4355 + }, + { + "epoch": 1.1497594093995123, + "grad_norm": 0.013624129816889763, + "learning_rate": 8.507718696397941e-05, + "loss": 0.0373, + "step": 4360 + }, + { + "epoch": 1.1510777140597193, + "grad_norm": 0.015372387133538723, + "learning_rate": 8.494524343580947e-05, + "loss": 0.0147, + "step": 4365 + }, + { + "epoch": 1.1523960187199263, + "grad_norm": 0.3312993347644806, + "learning_rate": 8.481329990763954e-05, + "loss": 0.0299, + "step": 4370 + }, + { + "epoch": 1.1537143233801332, + "grad_norm": 0.023838184773921967, + "learning_rate": 8.468135637946959e-05, + "loss": 0.0226, + "step": 4375 + }, + { + "epoch": 1.15503262804034, + "grad_norm": 0.42516952753067017, + "learning_rate": 8.454941285129965e-05, + "loss": 0.0088, + "step": 4380 + }, + { + "epoch": 1.156350932700547, + "grad_norm": 0.6900278925895691, + "learning_rate": 8.44174693231297e-05, + "loss": 0.0245, + "step": 4385 + }, + { + "epoch": 1.157669237360754, + "grad_norm": 0.2932703197002411, + "learning_rate": 8.428552579495976e-05, + "loss": 0.0207, + "step": 4390 + }, + { + "epoch": 1.158987542020961, + "grad_norm": 0.12942780554294586, + "learning_rate": 8.415358226678982e-05, + "loss": 0.0037, + "step": 4395 + }, + { + "epoch": 1.160305846681168, + "grad_norm": 0.9499046802520752, + "learning_rate": 8.402163873861989e-05, + "loss": 0.0246, + "step": 4400 + }, + { + "epoch": 1.161624151341375, + "grad_norm": 0.008869118988513947, + "learning_rate": 8.388969521044994e-05, + "loss": 0.0171, + "step": 4405 + }, + { + "epoch": 1.162942456001582, + "grad_norm": 1.7409231662750244, + "learning_rate": 8.375775168227998e-05, + "loss": 0.017, + "step": 4410 + }, + { + "epoch": 1.164260760661789, + "grad_norm": 0.0020101398695260286, + "learning_rate": 8.362580815411004e-05, + "loss": 0.0027, + "step": 4415 + }, + { + "epoch": 1.165579065321996, + "grad_norm": 0.0785067081451416, + "learning_rate": 8.34938646259401e-05, + "loss": 0.0043, + "step": 4420 + }, + { + "epoch": 1.166897369982203, + "grad_norm": 0.0029506285209208727, + "learning_rate": 8.336192109777016e-05, + "loss": 0.0109, + "step": 4425 + }, + { + "epoch": 1.16821567464241, + "grad_norm": 0.02216683328151703, + "learning_rate": 8.322997756960022e-05, + "loss": 0.0026, + "step": 4430 + }, + { + "epoch": 1.1695339793026167, + "grad_norm": 0.02216639369726181, + "learning_rate": 8.309803404143027e-05, + "loss": 0.0045, + "step": 4435 + }, + { + "epoch": 1.170852283962824, + "grad_norm": 0.0, + "learning_rate": 8.296609051326033e-05, + "loss": 0.006, + "step": 4440 + }, + { + "epoch": 1.1721705886230307, + "grad_norm": 0.0019736960530281067, + "learning_rate": 8.283414698509039e-05, + "loss": 0.0078, + "step": 4445 + }, + { + "epoch": 1.1734888932832377, + "grad_norm": 0.012957746163010597, + "learning_rate": 8.270220345692044e-05, + "loss": 0.002, + "step": 4450 + }, + { + "epoch": 1.1748071979434447, + "grad_norm": 0.010877869091928005, + "learning_rate": 8.25702599287505e-05, + "loss": 0.0237, + "step": 4455 + }, + { + "epoch": 1.1761255026036517, + "grad_norm": 0.005947659723460674, + "learning_rate": 8.243831640058055e-05, + "loss": 0.0341, + "step": 4460 + }, + { + "epoch": 1.1774438072638587, + "grad_norm": 0.0005026470171287656, + "learning_rate": 8.230637287241061e-05, + "loss": 0.0033, + "step": 4465 + }, + { + "epoch": 1.1787621119240657, + "grad_norm": 0.022054588422179222, + "learning_rate": 8.217442934424066e-05, + "loss": 0.0042, + "step": 4470 + }, + { + "epoch": 1.1800804165842727, + 
"grad_norm": 0.7929030656814575, + "learning_rate": 8.204248581607072e-05, + "loss": 0.0076, + "step": 4475 + }, + { + "epoch": 1.1813987212444796, + "grad_norm": 0.39052629470825195, + "learning_rate": 8.191054228790078e-05, + "loss": 0.0228, + "step": 4480 + }, + { + "epoch": 1.1827170259046866, + "grad_norm": 0.007177622988820076, + "learning_rate": 8.177859875973084e-05, + "loss": 0.01, + "step": 4485 + }, + { + "epoch": 1.1840353305648936, + "grad_norm": 0.006175135262310505, + "learning_rate": 8.16466552315609e-05, + "loss": 0.0037, + "step": 4490 + }, + { + "epoch": 1.1853536352251006, + "grad_norm": 0.0356481671333313, + "learning_rate": 8.151471170339096e-05, + "loss": 0.0024, + "step": 4495 + }, + { + "epoch": 1.1866719398853074, + "grad_norm": 0.19069480895996094, + "learning_rate": 8.138276817522101e-05, + "loss": 0.0048, + "step": 4500 + }, + { + "epoch": 1.1866719398853074, + "eval_loss": 0.026386437937617302, + "eval_runtime": 452.2896, + "eval_samples_per_second": 7.455, + "eval_steps_per_second": 3.728, + "step": 4500 + }, + { + "epoch": 1.1879902445455144, + "grad_norm": 0.002254961524158716, + "learning_rate": 8.125082464705107e-05, + "loss": 0.0014, + "step": 4505 + }, + { + "epoch": 1.1893085492057214, + "grad_norm": 0.8026870489120483, + "learning_rate": 8.111888111888112e-05, + "loss": 0.0411, + "step": 4510 + }, + { + "epoch": 1.1906268538659284, + "grad_norm": 0.47328072786331177, + "learning_rate": 8.098693759071118e-05, + "loss": 0.0271, + "step": 4515 + }, + { + "epoch": 1.1919451585261354, + "grad_norm": 0.4888288676738739, + "learning_rate": 8.085499406254123e-05, + "loss": 0.039, + "step": 4520 + }, + { + "epoch": 1.1932634631863424, + "grad_norm": 0.000925812462810427, + "learning_rate": 8.072305053437129e-05, + "loss": 0.0461, + "step": 4525 + }, + { + "epoch": 1.1945817678465493, + "grad_norm": 0.12472371757030487, + "learning_rate": 8.059110700620135e-05, + "loss": 0.0037, + "step": 4530 + }, + { + "epoch": 1.1959000725067563, + "grad_norm": 0.002875336678698659, + "learning_rate": 8.04591634780314e-05, + "loss": 0.0425, + "step": 4535 + }, + { + "epoch": 1.1972183771669633, + "grad_norm": 0.042056187987327576, + "learning_rate": 8.032721994986147e-05, + "loss": 0.0068, + "step": 4540 + }, + { + "epoch": 1.1985366818271703, + "grad_norm": 0.157605841755867, + "learning_rate": 8.019527642169153e-05, + "loss": 0.0179, + "step": 4545 + }, + { + "epoch": 1.1998549864873773, + "grad_norm": 0.005153563339263201, + "learning_rate": 8.006333289352158e-05, + "loss": 0.0045, + "step": 4550 + }, + { + "epoch": 1.201173291147584, + "grad_norm": 0.02541598491370678, + "learning_rate": 7.993138936535164e-05, + "loss": 0.0041, + "step": 4555 + }, + { + "epoch": 1.2024915958077913, + "grad_norm": 0.04266195371747017, + "learning_rate": 7.979944583718168e-05, + "loss": 0.0121, + "step": 4560 + }, + { + "epoch": 1.203809900467998, + "grad_norm": 0.36108532547950745, + "learning_rate": 7.966750230901175e-05, + "loss": 0.0147, + "step": 4565 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.40405452251434326, + "learning_rate": 7.95355587808418e-05, + "loss": 0.0056, + "step": 4570 + }, + { + "epoch": 1.206446509788412, + "grad_norm": 0.030422702431678772, + "learning_rate": 7.940361525267186e-05, + "loss": 0.0055, + "step": 4575 + }, + { + "epoch": 1.207764814448619, + "grad_norm": 0.014555396512150764, + "learning_rate": 7.927167172450192e-05, + "loss": 0.0029, + "step": 4580 + }, + { + "epoch": 1.209083119108826, + "grad_norm": 0.33962950110435486, + 
"learning_rate": 7.913972819633197e-05, + "loss": 0.0191, + "step": 4585 + }, + { + "epoch": 1.210401423769033, + "grad_norm": 0.040150560438632965, + "learning_rate": 7.900778466816203e-05, + "loss": 0.0096, + "step": 4590 + }, + { + "epoch": 1.21171972842924, + "grad_norm": 0.2968510091304779, + "learning_rate": 7.88758411399921e-05, + "loss": 0.0311, + "step": 4595 + }, + { + "epoch": 1.213038033089447, + "grad_norm": 0.04709814116358757, + "learning_rate": 7.874389761182215e-05, + "loss": 0.0175, + "step": 4600 + }, + { + "epoch": 1.214356337749654, + "grad_norm": 0.1379537284374237, + "learning_rate": 7.861195408365221e-05, + "loss": 0.02, + "step": 4605 + }, + { + "epoch": 1.215674642409861, + "grad_norm": 0.018291711807250977, + "learning_rate": 7.848001055548225e-05, + "loss": 0.003, + "step": 4610 + }, + { + "epoch": 1.216992947070068, + "grad_norm": 0.041676126420497894, + "learning_rate": 7.83480670273123e-05, + "loss": 0.0054, + "step": 4615 + }, + { + "epoch": 1.2183112517302748, + "grad_norm": 0.0013747498160228133, + "learning_rate": 7.821612349914237e-05, + "loss": 0.0132, + "step": 4620 + }, + { + "epoch": 1.2196295563904818, + "grad_norm": 0.0050489697605371475, + "learning_rate": 7.808417997097243e-05, + "loss": 0.0272, + "step": 4625 + }, + { + "epoch": 1.2209478610506888, + "grad_norm": 0.017974581569433212, + "learning_rate": 7.795223644280249e-05, + "loss": 0.0037, + "step": 4630 + }, + { + "epoch": 1.2222661657108957, + "grad_norm": 0.001916698063723743, + "learning_rate": 7.782029291463254e-05, + "loss": 0.002, + "step": 4635 + }, + { + "epoch": 1.2235844703711027, + "grad_norm": 0.05344574153423309, + "learning_rate": 7.76883493864626e-05, + "loss": 0.0114, + "step": 4640 + }, + { + "epoch": 1.2249027750313097, + "grad_norm": 0.22823786735534668, + "learning_rate": 7.755640585829265e-05, + "loss": 0.0296, + "step": 4645 + }, + { + "epoch": 1.2262210796915167, + "grad_norm": 0.02051074244081974, + "learning_rate": 7.742446233012272e-05, + "loss": 0.0037, + "step": 4650 + }, + { + "epoch": 1.2275393843517237, + "grad_norm": 0.9797061681747437, + "learning_rate": 7.729251880195276e-05, + "loss": 0.011, + "step": 4655 + }, + { + "epoch": 1.2288576890119307, + "grad_norm": 0.0017285927897319198, + "learning_rate": 7.716057527378282e-05, + "loss": 0.0224, + "step": 4660 + }, + { + "epoch": 1.2301759936721377, + "grad_norm": 0.021783018484711647, + "learning_rate": 7.702863174561288e-05, + "loss": 0.0174, + "step": 4665 + }, + { + "epoch": 1.2314942983323447, + "grad_norm": 0.00763307698071003, + "learning_rate": 7.689668821744293e-05, + "loss": 0.0516, + "step": 4670 + }, + { + "epoch": 1.2328126029925515, + "grad_norm": 0.32605209946632385, + "learning_rate": 7.676474468927299e-05, + "loss": 0.0301, + "step": 4675 + }, + { + "epoch": 1.2341309076527585, + "grad_norm": 1.2027722597122192, + "learning_rate": 7.663280116110306e-05, + "loss": 0.0474, + "step": 4680 + }, + { + "epoch": 1.2354492123129655, + "grad_norm": 0.10201717168092728, + "learning_rate": 7.650085763293311e-05, + "loss": 0.0144, + "step": 4685 + }, + { + "epoch": 1.2367675169731724, + "grad_norm": 0.013835664838552475, + "learning_rate": 7.636891410476317e-05, + "loss": 0.0024, + "step": 4690 + }, + { + "epoch": 1.2380858216333794, + "grad_norm": 0.005699916277080774, + "learning_rate": 7.623697057659322e-05, + "loss": 0.0089, + "step": 4695 + }, + { + "epoch": 1.2394041262935864, + "grad_norm": 0.16583332419395447, + "learning_rate": 7.610502704842328e-05, + "loss": 0.019, + "step": 4700 + }, + { + 
"epoch": 1.2407224309537934, + "grad_norm": 0.2734023332595825, + "learning_rate": 7.597308352025333e-05, + "loss": 0.0041, + "step": 4705 + }, + { + "epoch": 1.2420407356140004, + "grad_norm": 0.04209504276514053, + "learning_rate": 7.584113999208339e-05, + "loss": 0.0292, + "step": 4710 + }, + { + "epoch": 1.2433590402742074, + "grad_norm": 0.0303195733577013, + "learning_rate": 7.570919646391345e-05, + "loss": 0.0019, + "step": 4715 + }, + { + "epoch": 1.2446773449344144, + "grad_norm": 0.014011899940669537, + "learning_rate": 7.55772529357435e-05, + "loss": 0.0236, + "step": 4720 + }, + { + "epoch": 1.2459956495946214, + "grad_norm": 0.37838876247406006, + "learning_rate": 7.544530940757356e-05, + "loss": 0.0081, + "step": 4725 + }, + { + "epoch": 1.2473139542548284, + "grad_norm": 0.003717717481777072, + "learning_rate": 7.531336587940361e-05, + "loss": 0.0036, + "step": 4730 + }, + { + "epoch": 1.2486322589150354, + "grad_norm": 1.2284752130508423, + "learning_rate": 7.518142235123368e-05, + "loss": 0.0089, + "step": 4735 + }, + { + "epoch": 1.2499505635752421, + "grad_norm": 0.015356095507740974, + "learning_rate": 7.504947882306374e-05, + "loss": 0.0074, + "step": 4740 + }, + { + "epoch": 1.2512688682354491, + "grad_norm": 0.0020383282098919153, + "learning_rate": 7.49175352948938e-05, + "loss": 0.0444, + "step": 4745 + }, + { + "epoch": 1.2525871728956561, + "grad_norm": 0.006680132355540991, + "learning_rate": 7.478559176672385e-05, + "loss": 0.009, + "step": 4750 + }, + { + "epoch": 1.2539054775558631, + "grad_norm": 0.01650019735097885, + "learning_rate": 7.465364823855389e-05, + "loss": 0.0022, + "step": 4755 + }, + { + "epoch": 1.2552237822160701, + "grad_norm": 0.009536102414131165, + "learning_rate": 7.452170471038396e-05, + "loss": 0.0026, + "step": 4760 + }, + { + "epoch": 1.256542086876277, + "grad_norm": 0.04677430912852287, + "learning_rate": 7.438976118221402e-05, + "loss": 0.004, + "step": 4765 + }, + { + "epoch": 1.257860391536484, + "grad_norm": 0.007777783088386059, + "learning_rate": 7.425781765404407e-05, + "loss": 0.0112, + "step": 4770 + }, + { + "epoch": 1.259178696196691, + "grad_norm": 0.03724197298288345, + "learning_rate": 7.412587412587413e-05, + "loss": 0.0065, + "step": 4775 + }, + { + "epoch": 1.260497000856898, + "grad_norm": 0.0023958412930369377, + "learning_rate": 7.399393059770418e-05, + "loss": 0.0238, + "step": 4780 + }, + { + "epoch": 1.261815305517105, + "grad_norm": 0.0036889975890517235, + "learning_rate": 7.386198706953424e-05, + "loss": 0.0012, + "step": 4785 + }, + { + "epoch": 1.263133610177312, + "grad_norm": 0.0009220903157256544, + "learning_rate": 7.373004354136431e-05, + "loss": 0.0017, + "step": 4790 + }, + { + "epoch": 1.2644519148375188, + "grad_norm": 0.0033395602367818356, + "learning_rate": 7.359810001319436e-05, + "loss": 0.0474, + "step": 4795 + }, + { + "epoch": 1.265770219497726, + "grad_norm": 0.004093261435627937, + "learning_rate": 7.346615648502442e-05, + "loss": 0.0025, + "step": 4800 + }, + { + "epoch": 1.2670885241579328, + "grad_norm": 0.004395488649606705, + "learning_rate": 7.333421295685446e-05, + "loss": 0.0011, + "step": 4805 + }, + { + "epoch": 1.2684068288181398, + "grad_norm": 0.024034051224589348, + "learning_rate": 7.320226942868452e-05, + "loss": 0.0027, + "step": 4810 + }, + { + "epoch": 1.2697251334783468, + "grad_norm": 0.9501499533653259, + "learning_rate": 7.307032590051459e-05, + "loss": 0.0279, + "step": 4815 + }, + { + "epoch": 1.2710434381385538, + "grad_norm": 0.008805549703538418, + 
"learning_rate": 7.293838237234464e-05, + "loss": 0.0403, + "step": 4820 + }, + { + "epoch": 1.2723617427987608, + "grad_norm": 0.01750873774290085, + "learning_rate": 7.28064388441747e-05, + "loss": 0.0571, + "step": 4825 + }, + { + "epoch": 1.2736800474589678, + "grad_norm": 0.004490260500460863, + "learning_rate": 7.267449531600475e-05, + "loss": 0.0269, + "step": 4830 + }, + { + "epoch": 1.2749983521191748, + "grad_norm": 0.07510064542293549, + "learning_rate": 7.254255178783481e-05, + "loss": 0.0123, + "step": 4835 + }, + { + "epoch": 1.2763166567793818, + "grad_norm": 0.039783038198947906, + "learning_rate": 7.241060825966486e-05, + "loss": 0.0137, + "step": 4840 + }, + { + "epoch": 1.2776349614395888, + "grad_norm": 0.019004900008440018, + "learning_rate": 7.227866473149493e-05, + "loss": 0.0047, + "step": 4845 + }, + { + "epoch": 1.2789532660997955, + "grad_norm": 0.04813052713871002, + "learning_rate": 7.214672120332499e-05, + "loss": 0.0021, + "step": 4850 + }, + { + "epoch": 1.2802715707600028, + "grad_norm": 0.00835048221051693, + "learning_rate": 7.201477767515503e-05, + "loss": 0.0014, + "step": 4855 + }, + { + "epoch": 1.2815898754202095, + "grad_norm": 0.008609198965132236, + "learning_rate": 7.188283414698509e-05, + "loss": 0.0219, + "step": 4860 + }, + { + "epoch": 1.2829081800804165, + "grad_norm": 0.007337458431720734, + "learning_rate": 7.175089061881514e-05, + "loss": 0.0014, + "step": 4865 + }, + { + "epoch": 1.2842264847406235, + "grad_norm": 0.0032645913306623697, + "learning_rate": 7.161894709064521e-05, + "loss": 0.0026, + "step": 4870 + }, + { + "epoch": 1.2855447894008305, + "grad_norm": 0.27384671568870544, + "learning_rate": 7.148700356247527e-05, + "loss": 0.0227, + "step": 4875 + }, + { + "epoch": 1.2868630940610375, + "grad_norm": 0.03584875538945198, + "learning_rate": 7.135506003430532e-05, + "loss": 0.0299, + "step": 4880 + }, + { + "epoch": 1.2881813987212445, + "grad_norm": 0.03482440486550331, + "learning_rate": 7.122311650613538e-05, + "loss": 0.0125, + "step": 4885 + }, + { + "epoch": 1.2894997033814515, + "grad_norm": 0.005974395200610161, + "learning_rate": 7.109117297796543e-05, + "loss": 0.0029, + "step": 4890 + }, + { + "epoch": 1.2908180080416585, + "grad_norm": 0.01820153370499611, + "learning_rate": 7.095922944979549e-05, + "loss": 0.0254, + "step": 4895 + }, + { + "epoch": 1.2921363127018655, + "grad_norm": 0.1733965277671814, + "learning_rate": 7.082728592162555e-05, + "loss": 0.028, + "step": 4900 + }, + { + "epoch": 1.2934546173620725, + "grad_norm": 1.3017303943634033, + "learning_rate": 7.06953423934556e-05, + "loss": 0.0213, + "step": 4905 + }, + { + "epoch": 1.2947729220222794, + "grad_norm": 0.01360877975821495, + "learning_rate": 7.056339886528566e-05, + "loss": 0.0039, + "step": 4910 + }, + { + "epoch": 1.2960912266824862, + "grad_norm": 0.01503999624401331, + "learning_rate": 7.043145533711571e-05, + "loss": 0.0102, + "step": 4915 + }, + { + "epoch": 1.2974095313426934, + "grad_norm": 0.2200804352760315, + "learning_rate": 7.029951180894577e-05, + "loss": 0.0461, + "step": 4920 + }, + { + "epoch": 1.2987278360029002, + "grad_norm": 0.08512946963310242, + "learning_rate": 7.016756828077582e-05, + "loss": 0.0066, + "step": 4925 + }, + { + "epoch": 1.3000461406631072, + "grad_norm": 0.08296570926904678, + "learning_rate": 7.00356247526059e-05, + "loss": 0.0223, + "step": 4930 + }, + { + "epoch": 1.3013644453233142, + "grad_norm": 0.008866079151630402, + "learning_rate": 6.990368122443595e-05, + "loss": 0.0032, + "step": 4935 + }, + 
{ + "epoch": 1.3026827499835212, + "grad_norm": 0.024493014439940453, + "learning_rate": 6.9771737696266e-05, + "loss": 0.0128, + "step": 4940 + }, + { + "epoch": 1.3040010546437282, + "grad_norm": 0.08965341746807098, + "learning_rate": 6.963979416809606e-05, + "loss": 0.028, + "step": 4945 + }, + { + "epoch": 1.3053193593039352, + "grad_norm": 0.023156631737947464, + "learning_rate": 6.950785063992612e-05, + "loss": 0.0187, + "step": 4950 + }, + { + "epoch": 1.3066376639641422, + "grad_norm": 0.18552155792713165, + "learning_rate": 6.937590711175617e-05, + "loss": 0.0424, + "step": 4955 + }, + { + "epoch": 1.3079559686243492, + "grad_norm": 0.02200198918581009, + "learning_rate": 6.924396358358623e-05, + "loss": 0.0148, + "step": 4960 + }, + { + "epoch": 1.3092742732845561, + "grad_norm": 0.00568364467471838, + "learning_rate": 6.911202005541628e-05, + "loss": 0.0199, + "step": 4965 + }, + { + "epoch": 1.310592577944763, + "grad_norm": 0.021591177210211754, + "learning_rate": 6.898007652724634e-05, + "loss": 0.0092, + "step": 4970 + }, + { + "epoch": 1.3119108826049701, + "grad_norm": 0.327177494764328, + "learning_rate": 6.88481329990764e-05, + "loss": 0.0047, + "step": 4975 + }, + { + "epoch": 1.313229187265177, + "grad_norm": 0.024512887001037598, + "learning_rate": 6.871618947090645e-05, + "loss": 0.0046, + "step": 4980 + }, + { + "epoch": 1.314547491925384, + "grad_norm": 0.05725006014108658, + "learning_rate": 6.858424594273652e-05, + "loss": 0.0227, + "step": 4985 + }, + { + "epoch": 1.3158657965855909, + "grad_norm": 0.011280277743935585, + "learning_rate": 6.845230241456658e-05, + "loss": 0.0056, + "step": 4990 + }, + { + "epoch": 1.3171841012457979, + "grad_norm": 0.022504402324557304, + "learning_rate": 6.832035888639663e-05, + "loss": 0.0029, + "step": 4995 + }, + { + "epoch": 1.3185024059060049, + "grad_norm": 0.02168826013803482, + "learning_rate": 6.818841535822669e-05, + "loss": 0.0198, + "step": 5000 + }, + { + "epoch": 1.3185024059060049, + "eval_loss": 0.025039294734597206, + "eval_runtime": 452.1097, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 5000 + }, + { + "epoch": 1.3198207105662119, + "grad_norm": 0.0064329709857702255, + "learning_rate": 6.805647183005673e-05, + "loss": 0.0299, + "step": 5005 + }, + { + "epoch": 1.3211390152264189, + "grad_norm": 0.00267885928042233, + "learning_rate": 6.79245283018868e-05, + "loss": 0.0065, + "step": 5010 + }, + { + "epoch": 1.3224573198866258, + "grad_norm": 0.6842889189720154, + "learning_rate": 6.779258477371685e-05, + "loss": 0.008, + "step": 5015 + }, + { + "epoch": 1.3237756245468328, + "grad_norm": 0.002985635306686163, + "learning_rate": 6.766064124554691e-05, + "loss": 0.0119, + "step": 5020 + }, + { + "epoch": 1.3250939292070396, + "grad_norm": 0.019304940477013588, + "learning_rate": 6.752869771737696e-05, + "loss": 0.0041, + "step": 5025 + }, + { + "epoch": 1.3264122338672468, + "grad_norm": 0.011305035091936588, + "learning_rate": 6.739675418920702e-05, + "loss": 0.0031, + "step": 5030 + }, + { + "epoch": 1.3277305385274536, + "grad_norm": 0.006184784695506096, + "learning_rate": 6.726481066103708e-05, + "loss": 0.0081, + "step": 5035 + }, + { + "epoch": 1.3290488431876606, + "grad_norm": 0.0073184361681342125, + "learning_rate": 6.713286713286715e-05, + "loss": 0.0202, + "step": 5040 + }, + { + "epoch": 1.3303671478478676, + "grad_norm": 0.006566181313246489, + "learning_rate": 6.70009236046972e-05, + "loss": 0.0052, + "step": 5045 + }, + { + "epoch": 1.3316854525080746, + 
"grad_norm": 0.31427526473999023, + "learning_rate": 6.686898007652726e-05, + "loss": 0.017, + "step": 5050 + }, + { + "epoch": 1.3330037571682816, + "grad_norm": 0.005085447803139687, + "learning_rate": 6.67370365483573e-05, + "loss": 0.009, + "step": 5055 + }, + { + "epoch": 1.3343220618284886, + "grad_norm": 0.2745366096496582, + "learning_rate": 6.660509302018735e-05, + "loss": 0.0119, + "step": 5060 + }, + { + "epoch": 1.3356403664886956, + "grad_norm": 0.2871796786785126, + "learning_rate": 6.647314949201742e-05, + "loss": 0.0158, + "step": 5065 + }, + { + "epoch": 1.3369586711489025, + "grad_norm": 0.2774186134338379, + "learning_rate": 6.634120596384748e-05, + "loss": 0.0084, + "step": 5070 + }, + { + "epoch": 1.3382769758091095, + "grad_norm": 0.013278775848448277, + "learning_rate": 6.620926243567753e-05, + "loss": 0.0111, + "step": 5075 + }, + { + "epoch": 1.3395952804693165, + "grad_norm": 0.01614517532289028, + "learning_rate": 6.607731890750759e-05, + "loss": 0.0066, + "step": 5080 + }, + { + "epoch": 1.3409135851295235, + "grad_norm": 0.0037789656780660152, + "learning_rate": 6.594537537933765e-05, + "loss": 0.0142, + "step": 5085 + }, + { + "epoch": 1.3422318897897303, + "grad_norm": 0.03221861273050308, + "learning_rate": 6.58134318511677e-05, + "loss": 0.0155, + "step": 5090 + }, + { + "epoch": 1.3435501944499375, + "grad_norm": 0.005637989845126867, + "learning_rate": 6.568148832299776e-05, + "loss": 0.0022, + "step": 5095 + }, + { + "epoch": 1.3448684991101443, + "grad_norm": 0.0017844432732090354, + "learning_rate": 6.554954479482783e-05, + "loss": 0.0217, + "step": 5100 + }, + { + "epoch": 1.3461868037703513, + "grad_norm": 0.08099021762609482, + "learning_rate": 6.541760126665787e-05, + "loss": 0.0222, + "step": 5105 + }, + { + "epoch": 1.3475051084305583, + "grad_norm": 0.011909045279026031, + "learning_rate": 6.528565773848792e-05, + "loss": 0.0058, + "step": 5110 + }, + { + "epoch": 1.3488234130907653, + "grad_norm": 0.7332578301429749, + "learning_rate": 6.515371421031798e-05, + "loss": 0.0286, + "step": 5115 + }, + { + "epoch": 1.3501417177509722, + "grad_norm": 0.3415885865688324, + "learning_rate": 6.502177068214804e-05, + "loss": 0.1191, + "step": 5120 + }, + { + "epoch": 1.3514600224111792, + "grad_norm": 0.00904211588203907, + "learning_rate": 6.48898271539781e-05, + "loss": 0.0043, + "step": 5125 + }, + { + "epoch": 1.3527783270713862, + "grad_norm": 0.1978830248117447, + "learning_rate": 6.475788362580816e-05, + "loss": 0.0316, + "step": 5130 + }, + { + "epoch": 1.3540966317315932, + "grad_norm": 0.10229042172431946, + "learning_rate": 6.462594009763822e-05, + "loss": 0.0194, + "step": 5135 + }, + { + "epoch": 1.3554149363918002, + "grad_norm": 0.4457210600376129, + "learning_rate": 6.449399656946827e-05, + "loss": 0.0276, + "step": 5140 + }, + { + "epoch": 1.356733241052007, + "grad_norm": 0.023706572130322456, + "learning_rate": 6.436205304129833e-05, + "loss": 0.0163, + "step": 5145 + }, + { + "epoch": 1.3580515457122142, + "grad_norm": 1.166896939277649, + "learning_rate": 6.423010951312838e-05, + "loss": 0.0189, + "step": 5150 + }, + { + "epoch": 1.359369850372421, + "grad_norm": 0.0016115796752274036, + "learning_rate": 6.409816598495844e-05, + "loss": 0.0191, + "step": 5155 + }, + { + "epoch": 1.360688155032628, + "grad_norm": 0.00786682777106762, + "learning_rate": 6.39662224567885e-05, + "loss": 0.0119, + "step": 5160 + }, + { + "epoch": 1.362006459692835, + "grad_norm": 1.042732834815979, + "learning_rate": 6.383427892861855e-05, + "loss": 
0.0497, + "step": 5165 + }, + { + "epoch": 1.363324764353042, + "grad_norm": 0.007983304560184479, + "learning_rate": 6.37023354004486e-05, + "loss": 0.044, + "step": 5170 + }, + { + "epoch": 1.364643069013249, + "grad_norm": 0.009767642244696617, + "learning_rate": 6.357039187227866e-05, + "loss": 0.0405, + "step": 5175 + }, + { + "epoch": 1.365961373673456, + "grad_norm": 0.03164628520607948, + "learning_rate": 6.343844834410873e-05, + "loss": 0.0138, + "step": 5180 + }, + { + "epoch": 1.367279678333663, + "grad_norm": 0.004159921780228615, + "learning_rate": 6.330650481593879e-05, + "loss": 0.0045, + "step": 5185 + }, + { + "epoch": 1.36859798299387, + "grad_norm": 0.004395391326397657, + "learning_rate": 6.317456128776884e-05, + "loss": 0.0046, + "step": 5190 + }, + { + "epoch": 1.369916287654077, + "grad_norm": 0.011886746622622013, + "learning_rate": 6.30426177595989e-05, + "loss": 0.0064, + "step": 5195 + }, + { + "epoch": 1.371234592314284, + "grad_norm": 0.2259266972541809, + "learning_rate": 6.291067423142895e-05, + "loss": 0.0076, + "step": 5200 + }, + { + "epoch": 1.372552896974491, + "grad_norm": 0.01407301053404808, + "learning_rate": 6.277873070325901e-05, + "loss": 0.0201, + "step": 5205 + }, + { + "epoch": 1.3738712016346977, + "grad_norm": 0.00911578256636858, + "learning_rate": 6.264678717508906e-05, + "loss": 0.0164, + "step": 5210 + }, + { + "epoch": 1.3751895062949049, + "grad_norm": 0.20968014001846313, + "learning_rate": 6.251484364691912e-05, + "loss": 0.0075, + "step": 5215 + }, + { + "epoch": 1.3765078109551117, + "grad_norm": 0.008801166899502277, + "learning_rate": 6.238290011874918e-05, + "loss": 0.0068, + "step": 5220 + }, + { + "epoch": 1.3778261156153186, + "grad_norm": 0.007181806955486536, + "learning_rate": 6.225095659057923e-05, + "loss": 0.0136, + "step": 5225 + }, + { + "epoch": 1.3791444202755256, + "grad_norm": 0.7527109980583191, + "learning_rate": 6.211901306240929e-05, + "loss": 0.0287, + "step": 5230 + }, + { + "epoch": 1.3804627249357326, + "grad_norm": 0.039015207439661026, + "learning_rate": 6.198706953423936e-05, + "loss": 0.0326, + "step": 5235 + }, + { + "epoch": 1.3817810295959396, + "grad_norm": 0.021076606586575508, + "learning_rate": 6.185512600606941e-05, + "loss": 0.0191, + "step": 5240 + }, + { + "epoch": 1.3830993342561466, + "grad_norm": 0.016630731523036957, + "learning_rate": 6.172318247789947e-05, + "loss": 0.0131, + "step": 5245 + }, + { + "epoch": 1.3844176389163536, + "grad_norm": 0.011133644729852676, + "learning_rate": 6.159123894972952e-05, + "loss": 0.0029, + "step": 5250 + }, + { + "epoch": 1.3857359435765606, + "grad_norm": 0.6434677243232727, + "learning_rate": 6.145929542155957e-05, + "loss": 0.0091, + "step": 5255 + }, + { + "epoch": 1.3870542482367676, + "grad_norm": 0.051020298153162, + "learning_rate": 6.132735189338964e-05, + "loss": 0.0086, + "step": 5260 + }, + { + "epoch": 1.3883725528969744, + "grad_norm": 0.016413932666182518, + "learning_rate": 6.119540836521969e-05, + "loss": 0.0061, + "step": 5265 + }, + { + "epoch": 1.3896908575571816, + "grad_norm": 0.005769540090113878, + "learning_rate": 6.106346483704975e-05, + "loss": 0.0027, + "step": 5270 + }, + { + "epoch": 1.3910091622173884, + "grad_norm": 0.06687796860933304, + "learning_rate": 6.09315213088798e-05, + "loss": 0.0423, + "step": 5275 + }, + { + "epoch": 1.3923274668775953, + "grad_norm": 0.005641553085297346, + "learning_rate": 6.079957778070986e-05, + "loss": 0.0353, + "step": 5280 + }, + { + "epoch": 1.3936457715378023, + "grad_norm": 
0.04460568353533745, + "learning_rate": 6.066763425253992e-05, + "loss": 0.0041, + "step": 5285 + }, + { + "epoch": 1.3949640761980093, + "grad_norm": 0.0387534461915493, + "learning_rate": 6.0535690724369976e-05, + "loss": 0.006, + "step": 5290 + }, + { + "epoch": 1.3962823808582163, + "grad_norm": 0.010292598977684975, + "learning_rate": 6.040374719620003e-05, + "loss": 0.0038, + "step": 5295 + }, + { + "epoch": 1.3976006855184233, + "grad_norm": 0.3646155297756195, + "learning_rate": 6.0271803668030094e-05, + "loss": 0.0111, + "step": 5300 + }, + { + "epoch": 1.3989189901786303, + "grad_norm": 0.022035539150238037, + "learning_rate": 6.0139860139860136e-05, + "loss": 0.0507, + "step": 5305 + }, + { + "epoch": 1.4002372948388373, + "grad_norm": 0.003314939560368657, + "learning_rate": 6.00079166116902e-05, + "loss": 0.0132, + "step": 5310 + }, + { + "epoch": 1.4015555994990443, + "grad_norm": 0.0838267058134079, + "learning_rate": 5.9875973083520254e-05, + "loss": 0.0105, + "step": 5315 + }, + { + "epoch": 1.4028739041592513, + "grad_norm": 0.009368584491312504, + "learning_rate": 5.974402955535031e-05, + "loss": 0.0026, + "step": 5320 + }, + { + "epoch": 1.4041922088194583, + "grad_norm": 0.031248098239302635, + "learning_rate": 5.961208602718037e-05, + "loss": 0.0151, + "step": 5325 + }, + { + "epoch": 1.405510513479665, + "grad_norm": 0.06447605788707733, + "learning_rate": 5.948014249901043e-05, + "loss": 0.0219, + "step": 5330 + }, + { + "epoch": 1.4068288181398723, + "grad_norm": 0.010814374312758446, + "learning_rate": 5.9348198970840484e-05, + "loss": 0.0038, + "step": 5335 + }, + { + "epoch": 1.408147122800079, + "grad_norm": 0.6235967874526978, + "learning_rate": 5.9216255442670546e-05, + "loss": 0.0354, + "step": 5340 + }, + { + "epoch": 1.409465427460286, + "grad_norm": 0.026741521432995796, + "learning_rate": 5.90843119145006e-05, + "loss": 0.0032, + "step": 5345 + }, + { + "epoch": 1.410783732120493, + "grad_norm": 0.019413433969020844, + "learning_rate": 5.895236838633066e-05, + "loss": 0.0216, + "step": 5350 + }, + { + "epoch": 1.4121020367807, + "grad_norm": 0.0735543966293335, + "learning_rate": 5.8820424858160706e-05, + "loss": 0.0033, + "step": 5355 + }, + { + "epoch": 1.413420341440907, + "grad_norm": 0.005189546383917332, + "learning_rate": 5.868848132999076e-05, + "loss": 0.021, + "step": 5360 + }, + { + "epoch": 1.414738646101114, + "grad_norm": 0.21240335702896118, + "learning_rate": 5.8556537801820824e-05, + "loss": 0.0294, + "step": 5365 + }, + { + "epoch": 1.416056950761321, + "grad_norm": 0.010165920481085777, + "learning_rate": 5.842459427365088e-05, + "loss": 0.0021, + "step": 5370 + }, + { + "epoch": 1.417375255421528, + "grad_norm": 0.026774069294333458, + "learning_rate": 5.8292650745480936e-05, + "loss": 0.0299, + "step": 5375 + }, + { + "epoch": 1.418693560081735, + "grad_norm": 0.0019810455851256847, + "learning_rate": 5.816070721731099e-05, + "loss": 0.0029, + "step": 5380 + }, + { + "epoch": 1.4200118647419417, + "grad_norm": 0.038888879120349884, + "learning_rate": 5.8028763689141054e-05, + "loss": 0.0069, + "step": 5385 + }, + { + "epoch": 1.421330169402149, + "grad_norm": 0.016180936247110367, + "learning_rate": 5.789682016097111e-05, + "loss": 0.0032, + "step": 5390 + }, + { + "epoch": 1.4226484740623557, + "grad_norm": 0.01119404286146164, + "learning_rate": 5.7764876632801165e-05, + "loss": 0.0024, + "step": 5395 + }, + { + "epoch": 1.4239667787225627, + "grad_norm": 0.010486694052815437, + "learning_rate": 5.763293310463123e-05, + "loss": 
0.0324, + "step": 5400 + }, + { + "epoch": 1.4252850833827697, + "grad_norm": 0.005453066434711218, + "learning_rate": 5.750098957646127e-05, + "loss": 0.0038, + "step": 5405 + }, + { + "epoch": 1.4266033880429767, + "grad_norm": 0.17556461691856384, + "learning_rate": 5.736904604829133e-05, + "loss": 0.0305, + "step": 5410 + }, + { + "epoch": 1.4279216927031837, + "grad_norm": 0.03074715845286846, + "learning_rate": 5.723710252012139e-05, + "loss": 0.003, + "step": 5415 + }, + { + "epoch": 1.4292399973633907, + "grad_norm": 1.7238941192626953, + "learning_rate": 5.710515899195144e-05, + "loss": 0.0254, + "step": 5420 + }, + { + "epoch": 1.4305583020235977, + "grad_norm": 0.012462320737540722, + "learning_rate": 5.6973215463781506e-05, + "loss": 0.0018, + "step": 5425 + }, + { + "epoch": 1.4318766066838047, + "grad_norm": 0.021576853469014168, + "learning_rate": 5.684127193561156e-05, + "loss": 0.0472, + "step": 5430 + }, + { + "epoch": 1.4331949113440117, + "grad_norm": 0.2862134575843811, + "learning_rate": 5.670932840744162e-05, + "loss": 0.0258, + "step": 5435 + }, + { + "epoch": 1.4345132160042184, + "grad_norm": 0.28419312834739685, + "learning_rate": 5.657738487927168e-05, + "loss": 0.0053, + "step": 5440 + }, + { + "epoch": 1.4358315206644257, + "grad_norm": 0.013650139793753624, + "learning_rate": 5.6445441351101735e-05, + "loss": 0.0126, + "step": 5445 + }, + { + "epoch": 1.4371498253246324, + "grad_norm": 0.01203097216784954, + "learning_rate": 5.631349782293179e-05, + "loss": 0.0076, + "step": 5450 + }, + { + "epoch": 1.4384681299848394, + "grad_norm": 0.0881054624915123, + "learning_rate": 5.618155429476184e-05, + "loss": 0.0178, + "step": 5455 + }, + { + "epoch": 1.4397864346450464, + "grad_norm": 0.5258516669273376, + "learning_rate": 5.6049610766591895e-05, + "loss": 0.0112, + "step": 5460 + }, + { + "epoch": 1.4411047393052534, + "grad_norm": 0.001202153041958809, + "learning_rate": 5.591766723842196e-05, + "loss": 0.0089, + "step": 5465 + }, + { + "epoch": 1.4424230439654604, + "grad_norm": 0.4498993456363678, + "learning_rate": 5.5785723710252014e-05, + "loss": 0.0252, + "step": 5470 + }, + { + "epoch": 1.4437413486256674, + "grad_norm": 0.17477644979953766, + "learning_rate": 5.565378018208207e-05, + "loss": 0.0169, + "step": 5475 + }, + { + "epoch": 1.4450596532858744, + "grad_norm": 0.019443338736891747, + "learning_rate": 5.552183665391213e-05, + "loss": 0.0019, + "step": 5480 + }, + { + "epoch": 1.4463779579460814, + "grad_norm": 0.005653039086610079, + "learning_rate": 5.538989312574219e-05, + "loss": 0.0231, + "step": 5485 + }, + { + "epoch": 1.4476962626062884, + "grad_norm": 0.01554112322628498, + "learning_rate": 5.525794959757224e-05, + "loss": 0.0167, + "step": 5490 + }, + { + "epoch": 1.4490145672664954, + "grad_norm": 0.044272180646657944, + "learning_rate": 5.5126006069402305e-05, + "loss": 0.007, + "step": 5495 + }, + { + "epoch": 1.4503328719267023, + "grad_norm": 0.014857172966003418, + "learning_rate": 5.499406254123236e-05, + "loss": 0.0045, + "step": 5500 + }, + { + "epoch": 1.4503328719267023, + "eval_loss": 0.02392147295176983, + "eval_runtime": 452.468, + "eval_samples_per_second": 7.452, + "eval_steps_per_second": 3.726, + "step": 5500 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + 
"should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.4390500138875494e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5500/training_args.bin b/checkpoint-5500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-5500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-6000/README.md b/checkpoint-6000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-6000/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-6000/adapter_config.json b/checkpoint-6000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-6000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-6000/adapter_model.safetensors b/checkpoint-6000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d6d9684b0765ee6c81643fd9cb502fb7d283172b --- /dev/null +++ b/checkpoint-6000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c94d9577c8fc107d77260fe93c7e6bf4506f05cf115e8e2a96b0a12405fe2e9 +size 97307544 diff --git a/checkpoint-6000/optimizer.pt b/checkpoint-6000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..32f6a8ba1056df02ef4fa8061827385ff17d3ac7 --- /dev/null +++ b/checkpoint-6000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5414199359ec49bae145dec180d2ca1c1bdbb4ec6e677c5b95da1ff12fd1714c +size 50866370 diff --git a/checkpoint-6000/rng_state.pth b/checkpoint-6000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fbd5f3f7b5fbb34dcce042a8b5fbd52d27e3dd9f --- /dev/null +++ b/checkpoint-6000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11fdc5d497281f9a51c9ea6954077328f4dd4c21f9190bcf7927de67f6b12607 +size 14244 diff --git a/checkpoint-6000/scheduler.pt b/checkpoint-6000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d778aef580ceacd2ff7d00869a142021fa5422aa --- /dev/null +++ b/checkpoint-6000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:10480d246b702ffffca3bab0af9f17158c0ee228a749d8228f114b0f9150fcf2 +size 1064 diff --git a/checkpoint-6000/special_tokens_map.json b/checkpoint-6000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-6000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-6000/tokenizer.json b/checkpoint-6000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-6000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-6000/tokenizer_config.json b/checkpoint-6000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-6000/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": 
"<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": 
"<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": 
"<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": 
"<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": 
"<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": 
"<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": 
"<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": 
"<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": 
"<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-6000/trainer_state.json b/checkpoint-6000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d9172cb767a201bbad00ea4ba9c69c92bfbf1ddd --- /dev/null +++ b/checkpoint-6000/trainer_state.json @@ -0,0 +1,8532 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 
1.5821633379473996, + "eval_steps": 500, + "global_step": 6000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + "step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 
0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + 
"learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + "learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + 
"learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, 
+ "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + "learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + 
"learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + 
"grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + 
"step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 
0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + "loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + 
"learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + 
"epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 
0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + 
"epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 
0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + 
"loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + }, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 
0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + 
"learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 
0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + "loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 
0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + "grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + 
"learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + 
"step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, + { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 
0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.03460961952805519, + "learning_rate": 0.00013402823591502838, + "loss": 0.0097, + "step": 2505 + }, + { + "epoch": 0.6617889394239008, + "grad_norm": 0.31785696744918823, + "learning_rate": 0.00013389629238685842, + "loss": 0.0303, + "step": 2510 + }, + { + "epoch": 0.6631072440841078, + "grad_norm": 0.4273851215839386, + "learning_rate": 0.0001337643488586885, + "loss": 0.0499, + "step": 2515 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.02236153744161129, + "learning_rate": 0.00013363240533051856, + "loss": 0.0069, + "step": 2520 + }, + { + "epoch": 0.6657438534045218, + "grad_norm": 0.1592864990234375, + "learning_rate": 0.0001335004618023486, + "loss": 0.0326, + "step": 2525 + }, + { + "epoch": 0.6670621580647288, + "grad_norm": 0.029961545020341873, + "learning_rate": 0.00013336851827417867, + "loss": 0.0178, + "step": 2530 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.03120764158666134, + "learning_rate": 0.00013323657474600872, + "loss": 0.115, + "step": 2535 + }, + { + "epoch": 0.6696987673851427, + "grad_norm": 0.01060028001666069, + "learning_rate": 0.00013310463121783879, + "loss": 0.0036, + "step": 2540 + }, + { + "epoch": 0.6710170720453497, + "grad_norm": 0.053470809012651443, + "learning_rate": 0.00013297268768966883, + "loss": 0.0079, + "step": 2545 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.022777097299695015, + "learning_rate": 0.00013284074416149887, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6736536813657636, + "grad_norm": 0.0548521913588047, + "learning_rate": 0.00013270880063332894, + "loss": 0.0503, + "step": 2555 + }, + { + "epoch": 0.6749719860259706, + "grad_norm": 0.02028457075357437, + "learning_rate": 0.00013257685710515898, + "loss": 0.0096, + "step": 2560 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.01569107361137867, + "learning_rate": 0.00013244491357698905, + "loss": 0.008, + "step": 2565 + }, + { + "epoch": 0.6776085953463845, + "grad_norm": 0.00743742985650897, + "learning_rate": 0.00013231297004881912, + "loss": 0.005, + "step": 2570 + }, + { + "epoch": 0.6789269000065915, + "grad_norm": 0.025164416059851646, + "learning_rate": 0.00013218102652064916, + "loss": 0.018, + "step": 2575 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.3653188645839691, + "learning_rate": 0.00013204908299247923, + "loss": 0.0295, + "step": 2580 + }, + { + "epoch": 0.6815635093270055, + "grad_norm": 0.685422956943512, + "learning_rate": 0.00013191713946430927, + "loss": 0.0335, + "step": 2585 + }, + { + "epoch": 0.6828818139872125, + "grad_norm": 0.675740122795105, + "learning_rate": 0.00013178519593613934, + "loss": 0.0592, + "step": 2590 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.10513252764940262, + "learning_rate": 0.00013165325240796938, + "loss": 0.0353, + "step": 2595 + }, + { + "epoch": 0.6855184233076264, + "grad_norm": 0.43512973189353943, + "learning_rate": 0.00013152130887979945, + "loss": 0.0142, + "step": 2600 + }, + { + "epoch": 0.6868367279678333, + "grad_norm": 0.029436839744448662, + "learning_rate": 0.00013138936535162952, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.5607122778892517, + "learning_rate": 0.00013125742182345957, + "loss": 0.0184, + "step": 2610 + }, + { + "epoch": 0.6894733372882473, + "grad_norm": 0.11365406215190887, + 
"learning_rate": 0.00013112547829528963, + "loss": 0.006, + "step": 2615 + }, + { + "epoch": 0.6907916419484543, + "grad_norm": 0.047227244824171066, + "learning_rate": 0.00013099353476711968, + "loss": 0.008, + "step": 2620 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.0005877618095837533, + "learning_rate": 0.00013086159123894975, + "loss": 0.0286, + "step": 2625 + }, + { + "epoch": 0.6934282512688682, + "grad_norm": 0.010759112425148487, + "learning_rate": 0.0001307296477107798, + "loss": 0.0062, + "step": 2630 + }, + { + "epoch": 0.6947465559290752, + "grad_norm": 0.07117745280265808, + "learning_rate": 0.00013059770418260986, + "loss": 0.0891, + "step": 2635 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.0639057606458664, + "learning_rate": 0.00013046576065443993, + "loss": 0.0072, + "step": 2640 + }, + { + "epoch": 0.6973831652494892, + "grad_norm": 0.027350090444087982, + "learning_rate": 0.00013033381712626994, + "loss": 0.0103, + "step": 2645 + }, + { + "epoch": 0.6987014699096962, + "grad_norm": 0.015336195938289165, + "learning_rate": 0.0001302018735981, + "loss": 0.0041, + "step": 2650 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 1.0650830268859863, + "learning_rate": 0.00013006993006993008, + "loss": 0.0443, + "step": 2655 + }, + { + "epoch": 0.70133807923011, + "grad_norm": 0.019073212519288063, + "learning_rate": 0.00012993798654176012, + "loss": 0.0331, + "step": 2660 + }, + { + "epoch": 0.702656383890317, + "grad_norm": 0.10109209269285202, + "learning_rate": 0.0001298060430135902, + "loss": 0.0054, + "step": 2665 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.03528957813978195, + "learning_rate": 0.00012967409948542023, + "loss": 0.0427, + "step": 2670 + }, + { + "epoch": 0.705292993210731, + "grad_norm": 0.03577788919210434, + "learning_rate": 0.0001295421559572503, + "loss": 0.023, + "step": 2675 + }, + { + "epoch": 0.706611297870938, + "grad_norm": 0.5576180815696716, + "learning_rate": 0.00012941021242908034, + "loss": 0.0416, + "step": 2680 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.017131298780441284, + "learning_rate": 0.0001292782689009104, + "loss": 0.0235, + "step": 2685 + }, + { + "epoch": 0.7092479071913519, + "grad_norm": 0.8517888784408569, + "learning_rate": 0.00012914632537274048, + "loss": 0.0168, + "step": 2690 + }, + { + "epoch": 0.7105662118515589, + "grad_norm": 0.23812156915664673, + "learning_rate": 0.00012901438184457052, + "loss": 0.0483, + "step": 2695 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.11746613681316376, + "learning_rate": 0.0001288824383164006, + "loss": 0.0255, + "step": 2700 + }, + { + "epoch": 0.7132028211719729, + "grad_norm": 0.20089928805828094, + "learning_rate": 0.00012875049478823064, + "loss": 0.0267, + "step": 2705 + }, + { + "epoch": 0.7145211258321799, + "grad_norm": 0.8301129937171936, + "learning_rate": 0.0001286185512600607, + "loss": 0.016, + "step": 2710 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.01838674768805504, + "learning_rate": 0.00012848660773189077, + "loss": 0.0229, + "step": 2715 + }, + { + "epoch": 0.7171577351525937, + "grad_norm": 0.03670337051153183, + "learning_rate": 0.00012835466420372082, + "loss": 0.038, + "step": 2720 + }, + { + "epoch": 0.7184760398128007, + "grad_norm": 0.0452633760869503, + "learning_rate": 0.00012822272067555089, + "loss": 0.0622, + "step": 2725 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.09503110498189926, + "learning_rate": 0.00012809077714738093, + "loss": 0.0209, + "step": 2730 + }, 
+ { + "epoch": 0.7211126491332147, + "grad_norm": 1.0327308177947998, + "learning_rate": 0.000127958833619211, + "loss": 0.0361, + "step": 2735 + }, + { + "epoch": 0.7224309537934217, + "grad_norm": 1.0049290657043457, + "learning_rate": 0.00012782689009104104, + "loss": 0.0365, + "step": 2740 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.029774073511362076, + "learning_rate": 0.00012769494656287108, + "loss": 0.0257, + "step": 2745 + }, + { + "epoch": 0.7250675631138356, + "grad_norm": 0.20974040031433105, + "learning_rate": 0.00012756300303470115, + "loss": 0.0542, + "step": 2750 + }, + { + "epoch": 0.7263858677740426, + "grad_norm": 0.8153854608535767, + "learning_rate": 0.0001274310595065312, + "loss": 0.0216, + "step": 2755 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.4393698573112488, + "learning_rate": 0.00012729911597836126, + "loss": 0.0451, + "step": 2760 + }, + { + "epoch": 0.7290224770944566, + "grad_norm": 0.06990349292755127, + "learning_rate": 0.00012716717245019133, + "loss": 0.03, + "step": 2765 + }, + { + "epoch": 0.7303407817546635, + "grad_norm": 0.32689470052719116, + "learning_rate": 0.00012703522892202137, + "loss": 0.0263, + "step": 2770 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.026600876823067665, + "learning_rate": 0.00012690328539385144, + "loss": 0.0404, + "step": 2775 + }, + { + "epoch": 0.7329773910750774, + "grad_norm": 0.11228257417678833, + "learning_rate": 0.00012677134186568148, + "loss": 0.0224, + "step": 2780 + }, + { + "epoch": 0.7342956957352844, + "grad_norm": 0.6469443440437317, + "learning_rate": 0.00012663939833751155, + "loss": 0.0178, + "step": 2785 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.020773250609636307, + "learning_rate": 0.0001265074548093416, + "loss": 0.011, + "step": 2790 + }, + { + "epoch": 0.7369323050556984, + "grad_norm": 0.7378728985786438, + "learning_rate": 0.00012637551128117167, + "loss": 0.0227, + "step": 2795 + }, + { + "epoch": 0.7382506097159054, + "grad_norm": 0.008189595304429531, + "learning_rate": 0.00012624356775300173, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.031633853912353516, + "learning_rate": 0.00012611162422483178, + "loss": 0.0093, + "step": 2805 + }, + { + "epoch": 0.7408872190363193, + "grad_norm": 0.5078475475311279, + "learning_rate": 0.00012597968069666185, + "loss": 0.0567, + "step": 2810 + }, + { + "epoch": 0.7422055236965263, + "grad_norm": 0.21766887605190277, + "learning_rate": 0.0001258477371684919, + "loss": 0.0485, + "step": 2815 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.3029612898826599, + "learning_rate": 0.00012571579364032196, + "loss": 0.032, + "step": 2820 + }, + { + "epoch": 0.7448421330169402, + "grad_norm": 1.2135159969329834, + "learning_rate": 0.00012558385011215203, + "loss": 0.0139, + "step": 2825 + }, + { + "epoch": 0.7461604376771472, + "grad_norm": 0.016875172033905983, + "learning_rate": 0.00012545190658398207, + "loss": 0.0323, + "step": 2830 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.08923230320215225, + "learning_rate": 0.00012531996305581214, + "loss": 0.0343, + "step": 2835 + }, + { + "epoch": 0.7487970469975611, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.00012518801952764215, + "loss": 0.0431, + "step": 2840 + }, + { + "epoch": 0.7501153516577681, + "grad_norm": 0.7344386577606201, + "learning_rate": 0.00012505607599947222, + "loss": 0.0389, + "step": 2845 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.03681635856628418, + 
"learning_rate": 0.0001249241324713023, + "loss": 0.0258, + "step": 2850 + }, + { + "epoch": 0.7527519609781821, + "grad_norm": 0.22866861522197723, + "learning_rate": 0.00012479218894313233, + "loss": 0.0223, + "step": 2855 + }, + { + "epoch": 0.7540702656383891, + "grad_norm": 0.029770435765385628, + "learning_rate": 0.0001246602454149624, + "loss": 0.0205, + "step": 2860 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.011845707893371582, + "learning_rate": 0.00012452830188679244, + "loss": 0.0252, + "step": 2865 + }, + { + "epoch": 0.756706874958803, + "grad_norm": 0.06696149706840515, + "learning_rate": 0.00012439635835862251, + "loss": 0.0166, + "step": 2870 + }, + { + "epoch": 0.75802517961901, + "grad_norm": 0.01653144136071205, + "learning_rate": 0.00012426441483045256, + "loss": 0.0487, + "step": 2875 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.031312476843595505, + "learning_rate": 0.00012413247130228263, + "loss": 0.0155, + "step": 2880 + }, + { + "epoch": 0.7606617889394239, + "grad_norm": 0.011625733226537704, + "learning_rate": 0.0001240005277741127, + "loss": 0.0333, + "step": 2885 + }, + { + "epoch": 0.7619800935996309, + "grad_norm": 0.012089414522051811, + "learning_rate": 0.00012386858424594274, + "loss": 0.003, + "step": 2890 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.3012307584285736, + "learning_rate": 0.0001237366407177728, + "loss": 0.0172, + "step": 2895 + }, + { + "epoch": 0.7646167029200448, + "grad_norm": 0.31575000286102295, + "learning_rate": 0.00012360469718960285, + "loss": 0.0409, + "step": 2900 + }, + { + "epoch": 0.7659350075802518, + "grad_norm": 0.009794364683330059, + "learning_rate": 0.00012347275366143292, + "loss": 0.0214, + "step": 2905 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5973085165023804, + "learning_rate": 0.00012334081013326299, + "loss": 0.0245, + "step": 2910 + }, + { + "epoch": 0.7685716169006658, + "grad_norm": 0.019750040024518967, + "learning_rate": 0.00012320886660509303, + "loss": 0.0063, + "step": 2915 + }, + { + "epoch": 0.7698899215608728, + "grad_norm": 0.06402858346700668, + "learning_rate": 0.0001230769230769231, + "loss": 0.0444, + "step": 2920 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.02876671403646469, + "learning_rate": 0.00012294497954875314, + "loss": 0.0103, + "step": 2925 + }, + { + "epoch": 0.7725265308812866, + "grad_norm": 0.6962207555770874, + "learning_rate": 0.0001228130360205832, + "loss": 0.0318, + "step": 2930 + }, + { + "epoch": 0.7738448355414936, + "grad_norm": 0.006536522414535284, + "learning_rate": 0.00012268109249241325, + "loss": 0.0096, + "step": 2935 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.07097168266773224, + "learning_rate": 0.0001225491489642433, + "loss": 0.0174, + "step": 2940 + }, + { + "epoch": 0.7764814448619076, + "grad_norm": 0.042360126972198486, + "learning_rate": 0.00012241720543607336, + "loss": 0.0158, + "step": 2945 + }, + { + "epoch": 0.7777997495221146, + "grad_norm": 0.01159572321921587, + "learning_rate": 0.0001222852619079034, + "loss": 0.0265, + "step": 2950 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.38408163189888, + "learning_rate": 0.00012215331837973347, + "loss": 0.0233, + "step": 2955 + }, + { + "epoch": 0.7804363588425285, + "grad_norm": 0.15588605403900146, + "learning_rate": 0.00012202137485156353, + "loss": 0.0041, + "step": 2960 + }, + { + "epoch": 0.7817546635027355, + "grad_norm": 0.006892362609505653, + "learning_rate": 0.00012188943132339358, + "loss": 0.0026, + 
"step": 2965 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.030915727838873863, + "learning_rate": 0.00012175748779522364, + "loss": 0.0028, + "step": 2970 + }, + { + "epoch": 0.7843912728231495, + "grad_norm": 0.8151025772094727, + "learning_rate": 0.00012162554426705371, + "loss": 0.0429, + "step": 2975 + }, + { + "epoch": 0.7857095774833565, + "grad_norm": 0.6765475273132324, + "learning_rate": 0.00012149360073888377, + "loss": 0.0319, + "step": 2980 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.054469238966703415, + "learning_rate": 0.00012136165721071382, + "loss": 0.0413, + "step": 2985 + }, + { + "epoch": 0.7883461868037703, + "grad_norm": 0.045610666275024414, + "learning_rate": 0.00012122971368254388, + "loss": 0.0521, + "step": 2990 + }, + { + "epoch": 0.7896644914639773, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.00012109777015437393, + "loss": 0.0846, + "step": 2995 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.0272397268563509, + "learning_rate": 0.00012096582662620399, + "loss": 0.0364, + "step": 3000 + }, + { + "epoch": 0.7909827961241843, + "eval_loss": 0.033312585204839706, + "eval_runtime": 452.2552, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 3000 + }, + { + "epoch": 0.7923011007843913, + "grad_norm": 0.08674059063196182, + "learning_rate": 0.00012083388309803406, + "loss": 0.0081, + "step": 3005 + }, + { + "epoch": 0.7936194054445982, + "grad_norm": 0.21960832178592682, + "learning_rate": 0.00012070193956986411, + "loss": 0.0468, + "step": 3010 + }, + { + "epoch": 0.7949377101048052, + "grad_norm": 0.11259289085865021, + "learning_rate": 0.00012056999604169417, + "loss": 0.0124, + "step": 3015 + }, + { + "epoch": 0.7962560147650122, + "grad_norm": 0.02945362776517868, + "learning_rate": 0.00012043805251352422, + "loss": 0.0298, + "step": 3020 + }, + { + "epoch": 0.7975743194252192, + "grad_norm": 0.27889615297317505, + "learning_rate": 0.00012030610898535428, + "loss": 0.0251, + "step": 3025 + }, + { + "epoch": 0.7988926240854262, + "grad_norm": 0.05873241275548935, + "learning_rate": 0.00012017416545718434, + "loss": 0.0132, + "step": 3030 + }, + { + "epoch": 0.8002109287456332, + "grad_norm": 0.1570046991109848, + "learning_rate": 0.00012004222192901439, + "loss": 0.0228, + "step": 3035 + }, + { + "epoch": 0.80152923340584, + "grad_norm": 0.12575332820415497, + "learning_rate": 0.00011991027840084443, + "loss": 0.0049, + "step": 3040 + }, + { + "epoch": 0.802847538066047, + "grad_norm": 0.8416435122489929, + "learning_rate": 0.00011977833487267449, + "loss": 0.0542, + "step": 3045 + }, + { + "epoch": 0.804165842726254, + "grad_norm": 0.2605098485946655, + "learning_rate": 0.00011964639134450454, + "loss": 0.0084, + "step": 3050 + }, + { + "epoch": 0.805484147386461, + "grad_norm": 0.8996294736862183, + "learning_rate": 0.00011951444781633461, + "loss": 0.0442, + "step": 3055 + }, + { + "epoch": 0.806802452046668, + "grad_norm": 2.7525105476379395, + "learning_rate": 0.00011938250428816467, + "loss": 0.0642, + "step": 3060 + }, + { + "epoch": 0.808120756706875, + "grad_norm": 0.14955930411815643, + "learning_rate": 0.00011925056075999473, + "loss": 0.0384, + "step": 3065 + }, + { + "epoch": 0.8094390613670819, + "grad_norm": 0.018756115809082985, + "learning_rate": 0.00011911861723182478, + "loss": 0.0154, + "step": 3070 + }, + { + "epoch": 0.8107573660272889, + "grad_norm": 0.23998615145683289, + "learning_rate": 0.00011898667370365484, + "loss": 0.0413, + "step": 3075 + }, + { + "epoch": 
0.8120756706874959, + "grad_norm": 0.27253249287605286, + "learning_rate": 0.00011885473017548489, + "loss": 0.0081, + "step": 3080 + }, + { + "epoch": 0.8133939753477029, + "grad_norm": 0.2925993502140045, + "learning_rate": 0.00011872278664731495, + "loss": 0.0332, + "step": 3085 + }, + { + "epoch": 0.8147122800079099, + "grad_norm": 0.5364832878112793, + "learning_rate": 0.00011859084311914502, + "loss": 0.0143, + "step": 3090 + }, + { + "epoch": 0.8160305846681168, + "grad_norm": 0.32104921340942383, + "learning_rate": 0.00011845889959097507, + "loss": 0.0216, + "step": 3095 + }, + { + "epoch": 0.8173488893283237, + "grad_norm": 0.0205856766551733, + "learning_rate": 0.00011832695606280513, + "loss": 0.0346, + "step": 3100 + }, + { + "epoch": 0.8186671939885307, + "grad_norm": 0.2541547417640686, + "learning_rate": 0.00011819501253463518, + "loss": 0.0793, + "step": 3105 + }, + { + "epoch": 0.8199854986487377, + "grad_norm": 0.08333491533994675, + "learning_rate": 0.00011806306900646524, + "loss": 0.0049, + "step": 3110 + }, + { + "epoch": 0.8213038033089447, + "grad_norm": 0.0355968177318573, + "learning_rate": 0.0001179311254782953, + "loss": 0.0051, + "step": 3115 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.06948401033878326, + "learning_rate": 0.00011779918195012536, + "loss": 0.013, + "step": 3120 + }, + { + "epoch": 0.8239404126293587, + "grad_norm": 0.03328891843557358, + "learning_rate": 0.00011766723842195542, + "loss": 0.0122, + "step": 3125 + }, + { + "epoch": 0.8252587172895656, + "grad_norm": 0.013782350346446037, + "learning_rate": 0.00011753529489378548, + "loss": 0.0073, + "step": 3130 + }, + { + "epoch": 0.8265770219497726, + "grad_norm": 0.024390392005443573, + "learning_rate": 0.00011740335136561553, + "loss": 0.0143, + "step": 3135 + }, + { + "epoch": 0.8278953266099796, + "grad_norm": 0.002548128366470337, + "learning_rate": 0.00011727140783744557, + "loss": 0.0027, + "step": 3140 + }, + { + "epoch": 0.8292136312701865, + "grad_norm": 0.11674848943948746, + "learning_rate": 0.00011713946430927563, + "loss": 0.0253, + "step": 3145 + }, + { + "epoch": 0.8305319359303935, + "grad_norm": 0.005774884019047022, + "learning_rate": 0.00011700752078110568, + "loss": 0.0018, + "step": 3150 + }, + { + "epoch": 0.8318502405906005, + "grad_norm": 0.5763069987297058, + "learning_rate": 0.00011687557725293574, + "loss": 0.0119, + "step": 3155 + }, + { + "epoch": 0.8331685452508074, + "grad_norm": 0.0027607593219727278, + "learning_rate": 0.0001167436337247658, + "loss": 0.0279, + "step": 3160 + }, + { + "epoch": 0.8344868499110144, + "grad_norm": 1.859642505645752, + "learning_rate": 0.00011661169019659585, + "loss": 0.0228, + "step": 3165 + }, + { + "epoch": 0.8358051545712214, + "grad_norm": 0.16597022116184235, + "learning_rate": 0.00011647974666842592, + "loss": 0.1228, + "step": 3170 + }, + { + "epoch": 0.8371234592314284, + "grad_norm": 0.33833742141723633, + "learning_rate": 0.00011634780314025598, + "loss": 0.073, + "step": 3175 + }, + { + "epoch": 0.8384417638916354, + "grad_norm": 0.024682912975549698, + "learning_rate": 0.00011621585961208603, + "loss": 0.0042, + "step": 3180 + }, + { + "epoch": 0.8397600685518424, + "grad_norm": 0.05926942452788353, + "learning_rate": 0.00011608391608391609, + "loss": 0.0066, + "step": 3185 + }, + { + "epoch": 0.8410783732120493, + "grad_norm": 0.1414029747247696, + "learning_rate": 0.00011595197255574614, + "loss": 0.0603, + "step": 3190 + }, + { + "epoch": 0.8423966778722562, + "grad_norm": 0.37928736209869385, + 
"learning_rate": 0.0001158200290275762, + "loss": 0.0266, + "step": 3195 + }, + { + "epoch": 0.8437149825324632, + "grad_norm": 0.018329354003071785, + "learning_rate": 0.00011568808549940627, + "loss": 0.0047, + "step": 3200 + }, + { + "epoch": 0.8450332871926702, + "grad_norm": 0.2993735373020172, + "learning_rate": 0.00011555614197123632, + "loss": 0.0218, + "step": 3205 + }, + { + "epoch": 0.8463515918528772, + "grad_norm": 0.1767728328704834, + "learning_rate": 0.00011542419844306638, + "loss": 0.0363, + "step": 3210 + }, + { + "epoch": 0.8476698965130842, + "grad_norm": 0.39774414896965027, + "learning_rate": 0.00011529225491489644, + "loss": 0.0506, + "step": 3215 + }, + { + "epoch": 0.8489882011732911, + "grad_norm": 0.021896762773394585, + "learning_rate": 0.00011516031138672649, + "loss": 0.0081, + "step": 3220 + }, + { + "epoch": 0.8503065058334981, + "grad_norm": 0.358372300863266, + "learning_rate": 0.00011502836785855655, + "loss": 0.0224, + "step": 3225 + }, + { + "epoch": 0.8516248104937051, + "grad_norm": 0.01605542004108429, + "learning_rate": 0.00011489642433038662, + "loss": 0.0215, + "step": 3230 + }, + { + "epoch": 0.8529431151539121, + "grad_norm": 0.021189266815781593, + "learning_rate": 0.00011476448080221667, + "loss": 0.0051, + "step": 3235 + }, + { + "epoch": 0.8542614198141191, + "grad_norm": 0.013394076377153397, + "learning_rate": 0.0001146325372740467, + "loss": 0.021, + "step": 3240 + }, + { + "epoch": 0.8555797244743261, + "grad_norm": 0.19848507642745972, + "learning_rate": 0.00011450059374587676, + "loss": 0.0285, + "step": 3245 + }, + { + "epoch": 0.856898029134533, + "grad_norm": 0.2463046759366989, + "learning_rate": 0.00011436865021770683, + "loss": 0.0384, + "step": 3250 + }, + { + "epoch": 0.8582163337947399, + "grad_norm": 0.37432390451431274, + "learning_rate": 0.00011423670668953688, + "loss": 0.0098, + "step": 3255 + }, + { + "epoch": 0.8595346384549469, + "grad_norm": 0.060943394899368286, + "learning_rate": 0.00011410476316136694, + "loss": 0.0087, + "step": 3260 + }, + { + "epoch": 0.8608529431151539, + "grad_norm": 0.2846696674823761, + "learning_rate": 0.00011397281963319699, + "loss": 0.0148, + "step": 3265 + }, + { + "epoch": 0.8621712477753609, + "grad_norm": 0.009311323054134846, + "learning_rate": 0.00011384087610502705, + "loss": 0.0024, + "step": 3270 + }, + { + "epoch": 0.8634895524355679, + "grad_norm": 0.046277035027742386, + "learning_rate": 0.0001137089325768571, + "loss": 0.0274, + "step": 3275 + }, + { + "epoch": 0.8648078570957748, + "grad_norm": 0.006024620030075312, + "learning_rate": 0.00011357698904868716, + "loss": 0.0286, + "step": 3280 + }, + { + "epoch": 0.8661261617559818, + "grad_norm": 0.033578380942344666, + "learning_rate": 0.00011344504552051723, + "loss": 0.0153, + "step": 3285 + }, + { + "epoch": 0.8674444664161888, + "grad_norm": 0.8537917137145996, + "learning_rate": 0.00011331310199234728, + "loss": 0.0304, + "step": 3290 + }, + { + "epoch": 0.8687627710763958, + "grad_norm": 0.013933337293565273, + "learning_rate": 0.00011318115846417734, + "loss": 0.0112, + "step": 3295 + }, + { + "epoch": 0.8700810757366028, + "grad_norm": 0.35437721014022827, + "learning_rate": 0.0001130492149360074, + "loss": 0.0228, + "step": 3300 + }, + { + "epoch": 0.8713993803968098, + "grad_norm": 1.3024121522903442, + "learning_rate": 0.00011291727140783745, + "loss": 0.0203, + "step": 3305 + }, + { + "epoch": 0.8727176850570166, + "grad_norm": 0.5131255984306335, + "learning_rate": 0.00011278532787966751, + "loss": 0.0181, + 
"step": 3310 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.039366886019706726, + "learning_rate": 0.00011265338435149758, + "loss": 0.0192, + "step": 3315 + }, + { + "epoch": 0.8753542943774306, + "grad_norm": 0.13679669797420502, + "learning_rate": 0.00011252144082332763, + "loss": 0.004, + "step": 3320 + }, + { + "epoch": 0.8766725990376376, + "grad_norm": 0.003076886525377631, + "learning_rate": 0.00011238949729515769, + "loss": 0.0405, + "step": 3325 + }, + { + "epoch": 0.8779909036978446, + "grad_norm": 0.019953785464167595, + "learning_rate": 0.00011225755376698774, + "loss": 0.0241, + "step": 3330 + }, + { + "epoch": 0.8793092083580516, + "grad_norm": 0.007980377413332462, + "learning_rate": 0.0001121256102388178, + "loss": 0.0064, + "step": 3335 + }, + { + "epoch": 0.8806275130182585, + "grad_norm": 0.018761295825242996, + "learning_rate": 0.00011199366671064784, + "loss": 0.0032, + "step": 3340 + }, + { + "epoch": 0.8819458176784655, + "grad_norm": 0.022511709481477737, + "learning_rate": 0.0001118617231824779, + "loss": 0.0055, + "step": 3345 + }, + { + "epoch": 0.8832641223386725, + "grad_norm": 0.021270718425512314, + "learning_rate": 0.00011172977965430795, + "loss": 0.033, + "step": 3350 + }, + { + "epoch": 0.8845824269988795, + "grad_norm": 0.02710561640560627, + "learning_rate": 0.00011159783612613801, + "loss": 0.0094, + "step": 3355 + }, + { + "epoch": 0.8859007316590864, + "grad_norm": 0.4353378117084503, + "learning_rate": 0.00011146589259796806, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.8872190363192934, + "grad_norm": 0.0257766991853714, + "learning_rate": 0.00011133394906979813, + "loss": 0.0059, + "step": 3365 + }, + { + "epoch": 0.8885373409795003, + "grad_norm": 0.80838942527771, + "learning_rate": 0.00011120200554162819, + "loss": 0.0263, + "step": 3370 + }, + { + "epoch": 0.8898556456397073, + "grad_norm": 0.007799761835485697, + "learning_rate": 0.00011107006201345824, + "loss": 0.0028, + "step": 3375 + }, + { + "epoch": 0.8911739502999143, + "grad_norm": 0.007315775845199823, + "learning_rate": 0.0001109381184852883, + "loss": 0.0127, + "step": 3380 + }, + { + "epoch": 0.8924922549601213, + "grad_norm": 1.4861233234405518, + "learning_rate": 0.00011080617495711836, + "loss": 0.0562, + "step": 3385 + }, + { + "epoch": 0.8938105596203283, + "grad_norm": 0.010219530202448368, + "learning_rate": 0.00011067423142894841, + "loss": 0.0438, + "step": 3390 + }, + { + "epoch": 0.8951288642805353, + "grad_norm": 1.0191857814788818, + "learning_rate": 0.00011054228790077848, + "loss": 0.0493, + "step": 3395 + }, + { + "epoch": 0.8964471689407422, + "grad_norm": 0.01459536887705326, + "learning_rate": 0.00011041034437260854, + "loss": 0.0117, + "step": 3400 + }, + { + "epoch": 0.8977654736009492, + "grad_norm": 0.008682495914399624, + "learning_rate": 0.00011027840084443859, + "loss": 0.02, + "step": 3405 + }, + { + "epoch": 0.8990837782611562, + "grad_norm": 0.02197263017296791, + "learning_rate": 0.00011014645731626865, + "loss": 0.0454, + "step": 3410 + }, + { + "epoch": 0.9004020829213631, + "grad_norm": 0.01436714269220829, + "learning_rate": 0.0001100145137880987, + "loss": 0.0283, + "step": 3415 + }, + { + "epoch": 0.9017203875815701, + "grad_norm": 0.14327946305274963, + "learning_rate": 0.00010988257025992876, + "loss": 0.0461, + "step": 3420 + }, + { + "epoch": 0.9030386922417771, + "grad_norm": 1.671773910522461, + "learning_rate": 0.00010975062673175883, + "loss": 0.054, + "step": 3425 + }, + { + "epoch": 0.904356996901984, + 
"grad_norm": 0.009926804341375828, + "learning_rate": 0.00010961868320358888, + "loss": 0.0429, + "step": 3430 + }, + { + "epoch": 0.905675301562191, + "grad_norm": 0.554020881652832, + "learning_rate": 0.00010948673967541894, + "loss": 0.0618, + "step": 3435 + }, + { + "epoch": 0.906993606222398, + "grad_norm": 0.1399248093366623, + "learning_rate": 0.00010935479614724897, + "loss": 0.0229, + "step": 3440 + }, + { + "epoch": 0.908311910882605, + "grad_norm": 0.02739197015762329, + "learning_rate": 0.00010922285261907904, + "loss": 0.0082, + "step": 3445 + }, + { + "epoch": 0.909630215542812, + "grad_norm": 0.33394527435302734, + "learning_rate": 0.00010909090909090909, + "loss": 0.0403, + "step": 3450 + }, + { + "epoch": 0.9109485202030189, + "grad_norm": 0.08083894103765488, + "learning_rate": 0.00010895896556273915, + "loss": 0.0406, + "step": 3455 + }, + { + "epoch": 0.9122668248632259, + "grad_norm": 0.39336663484573364, + "learning_rate": 0.0001088270220345692, + "loss": 0.02, + "step": 3460 + }, + { + "epoch": 0.9135851295234328, + "grad_norm": 0.20481553673744202, + "learning_rate": 0.00010869507850639926, + "loss": 0.0221, + "step": 3465 + }, + { + "epoch": 0.9149034341836398, + "grad_norm": 1.4507408142089844, + "learning_rate": 0.00010856313497822932, + "loss": 0.0357, + "step": 3470 + }, + { + "epoch": 0.9162217388438468, + "grad_norm": 0.2678806483745575, + "learning_rate": 0.00010843119145005937, + "loss": 0.0181, + "step": 3475 + }, + { + "epoch": 0.9175400435040538, + "grad_norm": 0.007361674215644598, + "learning_rate": 0.00010829924792188944, + "loss": 0.0978, + "step": 3480 + }, + { + "epoch": 0.9188583481642607, + "grad_norm": 0.773695707321167, + "learning_rate": 0.0001081673043937195, + "loss": 0.0401, + "step": 3485 + }, + { + "epoch": 0.9201766528244677, + "grad_norm": 0.0010772625682875514, + "learning_rate": 0.00010803536086554955, + "loss": 0.0233, + "step": 3490 + }, + { + "epoch": 0.9214949574846747, + "grad_norm": 0.08971104770898819, + "learning_rate": 0.00010790341733737961, + "loss": 0.0319, + "step": 3495 + }, + { + "epoch": 0.9228132621448817, + "grad_norm": 0.21372731029987335, + "learning_rate": 0.00010777147380920966, + "loss": 0.0315, + "step": 3500 + }, + { + "epoch": 0.9228132621448817, + "eval_loss": 0.02952708676457405, + "eval_runtime": 451.5837, + "eval_samples_per_second": 7.467, + "eval_steps_per_second": 3.734, + "step": 3500 + }, + { + "epoch": 0.9241315668050887, + "grad_norm": 0.016639264300465584, + "learning_rate": 0.00010763953028103972, + "loss": 0.0125, + "step": 3505 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.46340492367744446, + "learning_rate": 0.00010750758675286979, + "loss": 0.0186, + "step": 3510 + }, + { + "epoch": 0.9267681761255026, + "grad_norm": 0.01847526989877224, + "learning_rate": 0.00010737564322469984, + "loss": 0.0026, + "step": 3515 + }, + { + "epoch": 0.9280864807857095, + "grad_norm": 0.5947860479354858, + "learning_rate": 0.0001072436996965299, + "loss": 0.0259, + "step": 3520 + }, + { + "epoch": 0.9294047854459165, + "grad_norm": 0.06145291402935982, + "learning_rate": 0.00010711175616835995, + "loss": 0.0057, + "step": 3525 + }, + { + "epoch": 0.9307230901061235, + "grad_norm": 0.0143959429115057, + "learning_rate": 0.00010697981264019001, + "loss": 0.0145, + "step": 3530 + }, + { + "epoch": 0.9320413947663305, + "grad_norm": 0.21143831312656403, + "learning_rate": 0.00010684786911202007, + "loss": 0.0459, + "step": 3535 + }, + { + "epoch": 0.9333596994265375, + "grad_norm": 0.02548077143728733, 
+ "learning_rate": 0.00010671592558385011, + "loss": 0.0051, + "step": 3540 + }, + { + "epoch": 0.9346780040867444, + "grad_norm": 0.008077048696577549, + "learning_rate": 0.00010658398205568016, + "loss": 0.0306, + "step": 3545 + }, + { + "epoch": 0.9359963087469514, + "grad_norm": 0.0030760422814637423, + "learning_rate": 0.00010645203852751022, + "loss": 0.0575, + "step": 3550 + }, + { + "epoch": 0.9373146134071584, + "grad_norm": 0.18114158511161804, + "learning_rate": 0.00010632009499934027, + "loss": 0.0885, + "step": 3555 + }, + { + "epoch": 0.9386329180673654, + "grad_norm": 0.02450549602508545, + "learning_rate": 0.00010618815147117034, + "loss": 0.0045, + "step": 3560 + }, + { + "epoch": 0.9399512227275724, + "grad_norm": 0.1238626018166542, + "learning_rate": 0.0001060562079430004, + "loss": 0.0166, + "step": 3565 + }, + { + "epoch": 0.9412695273877794, + "grad_norm": 0.1879919469356537, + "learning_rate": 0.00010592426441483046, + "loss": 0.0077, + "step": 3570 + }, + { + "epoch": 0.9425878320479862, + "grad_norm": 0.11323565989732742, + "learning_rate": 0.00010579232088666051, + "loss": 0.0213, + "step": 3575 + }, + { + "epoch": 0.9439061367081932, + "grad_norm": 0.35575854778289795, + "learning_rate": 0.00010566037735849057, + "loss": 0.0336, + "step": 3580 + }, + { + "epoch": 0.9452244413684002, + "grad_norm": 0.14052227139472961, + "learning_rate": 0.00010552843383032062, + "loss": 0.0325, + "step": 3585 + }, + { + "epoch": 0.9465427460286072, + "grad_norm": 0.2643798887729645, + "learning_rate": 0.00010539649030215069, + "loss": 0.0192, + "step": 3590 + }, + { + "epoch": 0.9478610506888142, + "grad_norm": 0.3207031190395355, + "learning_rate": 0.00010526454677398075, + "loss": 0.0221, + "step": 3595 + }, + { + "epoch": 0.9491793553490212, + "grad_norm": 0.022803861647844315, + "learning_rate": 0.0001051326032458108, + "loss": 0.029, + "step": 3600 + }, + { + "epoch": 0.9504976600092281, + "grad_norm": 0.02511664852499962, + "learning_rate": 0.00010500065971764086, + "loss": 0.0422, + "step": 3605 + }, + { + "epoch": 0.9518159646694351, + "grad_norm": 0.06505445390939713, + "learning_rate": 0.00010486871618947091, + "loss": 0.0092, + "step": 3610 + }, + { + "epoch": 0.9531342693296421, + "grad_norm": 0.09998584538698196, + "learning_rate": 0.00010473677266130097, + "loss": 0.0242, + "step": 3615 + }, + { + "epoch": 0.9544525739898491, + "grad_norm": 0.9645698666572571, + "learning_rate": 0.00010460482913313104, + "loss": 0.0124, + "step": 3620 + }, + { + "epoch": 0.955770878650056, + "grad_norm": 0.2389964610338211, + "learning_rate": 0.0001044728856049611, + "loss": 0.0169, + "step": 3625 + }, + { + "epoch": 0.957089183310263, + "grad_norm": 2.030608654022217, + "learning_rate": 0.00010434094207679115, + "loss": 0.0518, + "step": 3630 + }, + { + "epoch": 0.9584074879704699, + "grad_norm": 0.05979987606406212, + "learning_rate": 0.0001042089985486212, + "loss": 0.0081, + "step": 3635 + }, + { + "epoch": 0.9597257926306769, + "grad_norm": 0.15761719644069672, + "learning_rate": 0.00010407705502045125, + "loss": 0.0061, + "step": 3640 + }, + { + "epoch": 0.9610440972908839, + "grad_norm": 0.6534290909767151, + "learning_rate": 0.0001039451114922813, + "loss": 0.0104, + "step": 3645 + }, + { + "epoch": 0.9623624019510909, + "grad_norm": 1.0324147939682007, + "learning_rate": 0.00010381316796411136, + "loss": 0.0381, + "step": 3650 + }, + { + "epoch": 0.9636807066112979, + "grad_norm": 0.002968872431665659, + "learning_rate": 0.00010368122443594142, + "loss": 0.0343, + "step": 
3655 + }, + { + "epoch": 0.9649990112715049, + "grad_norm": 0.011243184097111225, + "learning_rate": 0.00010354928090777147, + "loss": 0.019, + "step": 3660 + }, + { + "epoch": 0.9663173159317118, + "grad_norm": 0.17663739621639252, + "learning_rate": 0.00010341733737960153, + "loss": 0.0452, + "step": 3665 + }, + { + "epoch": 0.9676356205919188, + "grad_norm": 1.2647719383239746, + "learning_rate": 0.00010328539385143158, + "loss": 0.0154, + "step": 3670 + }, + { + "epoch": 0.9689539252521258, + "grad_norm": 0.3691752552986145, + "learning_rate": 0.00010315345032326165, + "loss": 0.028, + "step": 3675 + }, + { + "epoch": 0.9702722299123328, + "grad_norm": 0.0015879774000495672, + "learning_rate": 0.00010302150679509171, + "loss": 0.0202, + "step": 3680 + }, + { + "epoch": 0.9715905345725397, + "grad_norm": 0.1441984623670578, + "learning_rate": 0.00010288956326692176, + "loss": 0.0221, + "step": 3685 + }, + { + "epoch": 0.9729088392327467, + "grad_norm": 0.20431455969810486, + "learning_rate": 0.00010275761973875182, + "loss": 0.0072, + "step": 3690 + }, + { + "epoch": 0.9742271438929536, + "grad_norm": 0.861625611782074, + "learning_rate": 0.00010262567621058187, + "loss": 0.0523, + "step": 3695 + }, + { + "epoch": 0.9755454485531606, + "grad_norm": 0.005049478262662888, + "learning_rate": 0.00010249373268241193, + "loss": 0.0051, + "step": 3700 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.49685510993003845, + "learning_rate": 0.000102361789154242, + "loss": 0.023, + "step": 3705 + }, + { + "epoch": 0.9781820578735746, + "grad_norm": 0.08789395540952682, + "learning_rate": 0.00010222984562607205, + "loss": 0.0159, + "step": 3710 + }, + { + "epoch": 0.9795003625337816, + "grad_norm": 0.027168691158294678, + "learning_rate": 0.00010209790209790211, + "loss": 0.0083, + "step": 3715 + }, + { + "epoch": 0.9808186671939886, + "grad_norm": 0.0006773864733986557, + "learning_rate": 0.00010196595856973217, + "loss": 0.0048, + "step": 3720 + }, + { + "epoch": 0.9821369718541955, + "grad_norm": 0.01636457070708275, + "learning_rate": 0.00010183401504156222, + "loss": 0.0159, + "step": 3725 + }, + { + "epoch": 0.9834552765144025, + "grad_norm": 0.10160859674215317, + "learning_rate": 0.00010170207151339228, + "loss": 0.0047, + "step": 3730 + }, + { + "epoch": 0.9847735811746094, + "grad_norm": 0.14173269271850586, + "learning_rate": 0.00010157012798522232, + "loss": 0.006, + "step": 3735 + }, + { + "epoch": 0.9860918858348164, + "grad_norm": 0.003458512481302023, + "learning_rate": 0.00010143818445705238, + "loss": 0.0193, + "step": 3740 + }, + { + "epoch": 0.9874101904950234, + "grad_norm": 0.005163820460438728, + "learning_rate": 0.00010130624092888243, + "loss": 0.0039, + "step": 3745 + }, + { + "epoch": 0.9887284951552304, + "grad_norm": 0.005913791712373495, + "learning_rate": 0.00010117429740071249, + "loss": 0.0119, + "step": 3750 + }, + { + "epoch": 0.9900467998154373, + "grad_norm": 0.00800853967666626, + "learning_rate": 0.00010104235387254256, + "loss": 0.044, + "step": 3755 + }, + { + "epoch": 0.9913651044756443, + "grad_norm": 0.18146778643131256, + "learning_rate": 0.00010091041034437261, + "loss": 0.0048, + "step": 3760 + }, + { + "epoch": 0.9926834091358513, + "grad_norm": 0.01235104724764824, + "learning_rate": 0.00010077846681620267, + "loss": 0.0017, + "step": 3765 + }, + { + "epoch": 0.9940017137960583, + "grad_norm": 0.17677897214889526, + "learning_rate": 0.00010064652328803272, + "loss": 0.0339, + "step": 3770 + }, + { + "epoch": 0.9953200184562653, + "grad_norm": 
0.0017472271574661136, + "learning_rate": 0.00010051457975986278, + "loss": 0.0494, + "step": 3775 + }, + { + "epoch": 0.9966383231164723, + "grad_norm": 0.10814860463142395, + "learning_rate": 0.00010038263623169283, + "loss": 0.0741, + "step": 3780 + }, + { + "epoch": 0.9979566277766792, + "grad_norm": 0.11329760402441025, + "learning_rate": 0.0001002506927035229, + "loss": 0.0182, + "step": 3785 + }, + { + "epoch": 0.9992749324368861, + "grad_norm": 0.11573276668787003, + "learning_rate": 0.00010011874917535296, + "loss": 0.0068, + "step": 3790 + }, + { + "epoch": 1.000790982796124, + "grad_norm": 0.08449886739253998, + "learning_rate": 9.998680564718301e-05, + "loss": 0.0141, + "step": 3795 + }, + { + "epoch": 1.002109287456331, + "grad_norm": 0.05035184696316719, + "learning_rate": 9.985486211901307e-05, + "loss": 0.0293, + "step": 3800 + }, + { + "epoch": 1.003427592116538, + "grad_norm": 0.0255444198846817, + "learning_rate": 9.972291859084313e-05, + "loss": 0.0054, + "step": 3805 + }, + { + "epoch": 1.004745896776745, + "grad_norm": 0.0033677336759865284, + "learning_rate": 9.959097506267318e-05, + "loss": 0.0567, + "step": 3810 + }, + { + "epoch": 1.006064201436952, + "grad_norm": 0.09453682601451874, + "learning_rate": 9.945903153450324e-05, + "loss": 0.0589, + "step": 3815 + }, + { + "epoch": 1.007382506097159, + "grad_norm": 0.01592979207634926, + "learning_rate": 9.932708800633329e-05, + "loss": 0.0043, + "step": 3820 + }, + { + "epoch": 1.008700810757366, + "grad_norm": 0.002263693604618311, + "learning_rate": 9.919514447816335e-05, + "loss": 0.0195, + "step": 3825 + }, + { + "epoch": 1.010019115417573, + "grad_norm": 0.013390793465077877, + "learning_rate": 9.90632009499934e-05, + "loss": 0.0152, + "step": 3830 + }, + { + "epoch": 1.01133742007778, + "grad_norm": 0.10473847389221191, + "learning_rate": 9.893125742182346e-05, + "loss": 0.0606, + "step": 3835 + }, + { + "epoch": 1.012655724737987, + "grad_norm": 0.05837221071124077, + "learning_rate": 9.879931389365353e-05, + "loss": 0.0121, + "step": 3840 + }, + { + "epoch": 1.013974029398194, + "grad_norm": 0.3803791105747223, + "learning_rate": 9.866737036548358e-05, + "loss": 0.0386, + "step": 3845 + }, + { + "epoch": 1.0152923340584008, + "grad_norm": 0.4067519009113312, + "learning_rate": 9.853542683731364e-05, + "loss": 0.0115, + "step": 3850 + }, + { + "epoch": 1.0166106387186078, + "grad_norm": 0.02585229091346264, + "learning_rate": 9.84034833091437e-05, + "loss": 0.0214, + "step": 3855 + }, + { + "epoch": 1.0179289433788148, + "grad_norm": 0.03670825809240341, + "learning_rate": 9.827153978097374e-05, + "loss": 0.0059, + "step": 3860 + }, + { + "epoch": 1.0192472480390218, + "grad_norm": 0.014171554706990719, + "learning_rate": 9.81395962528038e-05, + "loss": 0.0145, + "step": 3865 + }, + { + "epoch": 1.0205655526992288, + "grad_norm": 0.027376385405659676, + "learning_rate": 9.800765272463386e-05, + "loss": 0.0089, + "step": 3870 + }, + { + "epoch": 1.0218838573594358, + "grad_norm": 0.03168405964970589, + "learning_rate": 9.787570919646392e-05, + "loss": 0.0132, + "step": 3875 + }, + { + "epoch": 1.0232021620196428, + "grad_norm": 0.03346199914813042, + "learning_rate": 9.774376566829397e-05, + "loss": 0.0246, + "step": 3880 + }, + { + "epoch": 1.0245204666798498, + "grad_norm": 0.00894144270569086, + "learning_rate": 9.761182214012403e-05, + "loss": 0.0105, + "step": 3885 + }, + { + "epoch": 1.0258387713400567, + "grad_norm": 0.3172806203365326, + "learning_rate": 9.747987861195409e-05, + "loss": 0.0103, + 
"step": 3890 + }, + { + "epoch": 1.0271570760002637, + "grad_norm": 0.009055040776729584, + "learning_rate": 9.734793508378414e-05, + "loss": 0.0103, + "step": 3895 + }, + { + "epoch": 1.0284753806604707, + "grad_norm": 0.014140011742711067, + "learning_rate": 9.721599155561421e-05, + "loss": 0.0037, + "step": 3900 + }, + { + "epoch": 1.0297936853206777, + "grad_norm": 0.008317383006215096, + "learning_rate": 9.708404802744427e-05, + "loss": 0.002, + "step": 3905 + }, + { + "epoch": 1.0311119899808845, + "grad_norm": 0.005038558971136808, + "learning_rate": 9.695210449927431e-05, + "loss": 0.0017, + "step": 3910 + }, + { + "epoch": 1.0324302946410915, + "grad_norm": 0.40058520436286926, + "learning_rate": 9.682016097110436e-05, + "loss": 0.0065, + "step": 3915 + }, + { + "epoch": 1.0337485993012985, + "grad_norm": 0.005197151098400354, + "learning_rate": 9.668821744293442e-05, + "loss": 0.0031, + "step": 3920 + }, + { + "epoch": 1.0350669039615055, + "grad_norm": 0.014353781007230282, + "learning_rate": 9.655627391476449e-05, + "loss": 0.0009, + "step": 3925 + }, + { + "epoch": 1.0363852086217125, + "grad_norm": 0.13260559737682343, + "learning_rate": 9.642433038659454e-05, + "loss": 0.0323, + "step": 3930 + }, + { + "epoch": 1.0377035132819195, + "grad_norm": 0.006795065477490425, + "learning_rate": 9.62923868584246e-05, + "loss": 0.0022, + "step": 3935 + }, + { + "epoch": 1.0390218179421264, + "grad_norm": 0.2276086062192917, + "learning_rate": 9.616044333025466e-05, + "loss": 0.0221, + "step": 3940 + }, + { + "epoch": 1.0403401226023334, + "grad_norm": 0.06121920794248581, + "learning_rate": 9.602849980208471e-05, + "loss": 0.0037, + "step": 3945 + }, + { + "epoch": 1.0416584272625404, + "grad_norm": 0.9180755019187927, + "learning_rate": 9.589655627391477e-05, + "loss": 0.0589, + "step": 3950 + }, + { + "epoch": 1.0429767319227474, + "grad_norm": 0.07515591382980347, + "learning_rate": 9.576461274574484e-05, + "loss": 0.0653, + "step": 3955 + }, + { + "epoch": 1.0442950365829544, + "grad_norm": 0.018060607835650444, + "learning_rate": 9.563266921757488e-05, + "loss": 0.0178, + "step": 3960 + }, + { + "epoch": 1.0456133412431612, + "grad_norm": 0.02751368284225464, + "learning_rate": 9.550072568940493e-05, + "loss": 0.0076, + "step": 3965 + }, + { + "epoch": 1.0469316459033682, + "grad_norm": 0.653998613357544, + "learning_rate": 9.536878216123499e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 1.0482499505635752, + "grad_norm": 0.3117768168449402, + "learning_rate": 9.523683863306505e-05, + "loss": 0.0087, + "step": 3975 + }, + { + "epoch": 1.0495682552237822, + "grad_norm": 0.013952831737697124, + "learning_rate": 9.510489510489511e-05, + "loss": 0.0037, + "step": 3980 + }, + { + "epoch": 1.0508865598839892, + "grad_norm": 0.01806250400841236, + "learning_rate": 9.497295157672517e-05, + "loss": 0.0028, + "step": 3985 + }, + { + "epoch": 1.0522048645441962, + "grad_norm": 0.13678006827831268, + "learning_rate": 9.484100804855523e-05, + "loss": 0.0533, + "step": 3990 + }, + { + "epoch": 1.0535231692044031, + "grad_norm": 0.14869382977485657, + "learning_rate": 9.470906452038528e-05, + "loss": 0.009, + "step": 3995 + }, + { + "epoch": 1.0548414738646101, + "grad_norm": 0.33614659309387207, + "learning_rate": 9.457712099221534e-05, + "loss": 0.0555, + "step": 4000 + }, + { + "epoch": 1.0548414738646101, + "eval_loss": 0.026165226474404335, + "eval_runtime": 452.2482, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 4000 + }, + { + "epoch": 
1.0561597785248171, + "grad_norm": 0.007546027656644583, + "learning_rate": 9.444517746404539e-05, + "loss": 0.0029, + "step": 4005 + }, + { + "epoch": 1.0574780831850241, + "grad_norm": 0.3720332384109497, + "learning_rate": 9.431323393587545e-05, + "loss": 0.0353, + "step": 4010 + }, + { + "epoch": 1.0587963878452311, + "grad_norm": 1.1335264444351196, + "learning_rate": 9.41812904077055e-05, + "loss": 0.0142, + "step": 4015 + }, + { + "epoch": 1.060114692505438, + "grad_norm": 0.024723488837480545, + "learning_rate": 9.404934687953556e-05, + "loss": 0.006, + "step": 4020 + }, + { + "epoch": 1.0614329971656449, + "grad_norm": 0.040354058146476746, + "learning_rate": 9.391740335136562e-05, + "loss": 0.0107, + "step": 4025 + }, + { + "epoch": 1.0627513018258519, + "grad_norm": 0.222810298204422, + "learning_rate": 9.378545982319567e-05, + "loss": 0.0273, + "step": 4030 + }, + { + "epoch": 1.0640696064860589, + "grad_norm": 0.025684095919132233, + "learning_rate": 9.365351629502574e-05, + "loss": 0.0033, + "step": 4035 + }, + { + "epoch": 1.0653879111462659, + "grad_norm": 0.05338352546095848, + "learning_rate": 9.35215727668558e-05, + "loss": 0.0052, + "step": 4040 + }, + { + "epoch": 1.0667062158064728, + "grad_norm": 0.06182330474257469, + "learning_rate": 9.338962923868585e-05, + "loss": 0.0038, + "step": 4045 + }, + { + "epoch": 1.0680245204666798, + "grad_norm": 0.012170832604169846, + "learning_rate": 9.325768571051591e-05, + "loss": 0.0018, + "step": 4050 + }, + { + "epoch": 1.0693428251268868, + "grad_norm": 0.5424306392669678, + "learning_rate": 9.312574218234596e-05, + "loss": 0.0445, + "step": 4055 + }, + { + "epoch": 1.0706611297870938, + "grad_norm": 0.017939254641532898, + "learning_rate": 9.299379865417602e-05, + "loss": 0.0389, + "step": 4060 + }, + { + "epoch": 1.0719794344473008, + "grad_norm": 0.0060431682504713535, + "learning_rate": 9.286185512600607e-05, + "loss": 0.0025, + "step": 4065 + }, + { + "epoch": 1.0732977391075078, + "grad_norm": 0.0071444883942604065, + "learning_rate": 9.272991159783613e-05, + "loss": 0.0333, + "step": 4070 + }, + { + "epoch": 1.0746160437677148, + "grad_norm": 0.29632750153541565, + "learning_rate": 9.259796806966619e-05, + "loss": 0.0151, + "step": 4075 + }, + { + "epoch": 1.0759343484279218, + "grad_norm": 0.004526323173195124, + "learning_rate": 9.246602454149624e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 1.0772526530881286, + "grad_norm": 0.023945212364196777, + "learning_rate": 9.23340810133263e-05, + "loss": 0.004, + "step": 4085 + }, + { + "epoch": 1.0785709577483356, + "grad_norm": 0.13235126435756683, + "learning_rate": 9.220213748515635e-05, + "loss": 0.0059, + "step": 4090 + }, + { + "epoch": 1.0798892624085425, + "grad_norm": 0.17592330276966095, + "learning_rate": 9.207019395698642e-05, + "loss": 0.0302, + "step": 4095 + }, + { + "epoch": 1.0812075670687495, + "grad_norm": 0.004582866560667753, + "learning_rate": 9.193825042881648e-05, + "loss": 0.009, + "step": 4100 + }, + { + "epoch": 1.0825258717289565, + "grad_norm": 0.15214525163173676, + "learning_rate": 9.180630690064653e-05, + "loss": 0.0062, + "step": 4105 + }, + { + "epoch": 1.0838441763891635, + "grad_norm": 0.16535983979701996, + "learning_rate": 9.167436337247658e-05, + "loss": 0.0926, + "step": 4110 + }, + { + "epoch": 1.0851624810493705, + "grad_norm": 0.013285227119922638, + "learning_rate": 9.154241984430663e-05, + "loss": 0.0043, + "step": 4115 + }, + { + "epoch": 1.0864807857095775, + "grad_norm": 0.012116984464228153, + "learning_rate": 
9.14104763161367e-05, + "loss": 0.0037, + "step": 4120 + }, + { + "epoch": 1.0877990903697845, + "grad_norm": 0.0373845212161541, + "learning_rate": 9.127853278796676e-05, + "loss": 0.0081, + "step": 4125 + }, + { + "epoch": 1.0891173950299915, + "grad_norm": 0.09324615448713303, + "learning_rate": 9.114658925979681e-05, + "loss": 0.0534, + "step": 4130 + }, + { + "epoch": 1.0904356996901985, + "grad_norm": 0.010992968454957008, + "learning_rate": 9.101464573162687e-05, + "loss": 0.0025, + "step": 4135 + }, + { + "epoch": 1.0917540043504055, + "grad_norm": 0.13710318505764008, + "learning_rate": 9.088270220345692e-05, + "loss": 0.0555, + "step": 4140 + }, + { + "epoch": 1.0930723090106123, + "grad_norm": 0.010403074324131012, + "learning_rate": 9.075075867528698e-05, + "loss": 0.0042, + "step": 4145 + }, + { + "epoch": 1.0943906136708192, + "grad_norm": 0.21544460952281952, + "learning_rate": 9.061881514711705e-05, + "loss": 0.0144, + "step": 4150 + }, + { + "epoch": 1.0957089183310262, + "grad_norm": 0.04194799065589905, + "learning_rate": 9.04868716189471e-05, + "loss": 0.0106, + "step": 4155 + }, + { + "epoch": 1.0970272229912332, + "grad_norm": 0.029204202815890312, + "learning_rate": 9.035492809077715e-05, + "loss": 0.0085, + "step": 4160 + }, + { + "epoch": 1.0983455276514402, + "grad_norm": 0.006751026958227158, + "learning_rate": 9.02229845626072e-05, + "loss": 0.0049, + "step": 4165 + }, + { + "epoch": 1.0996638323116472, + "grad_norm": 0.008232722990214825, + "learning_rate": 9.009104103443726e-05, + "loss": 0.0172, + "step": 4170 + }, + { + "epoch": 1.1009821369718542, + "grad_norm": 0.05630079656839371, + "learning_rate": 8.995909750626733e-05, + "loss": 0.0112, + "step": 4175 + }, + { + "epoch": 1.1023004416320612, + "grad_norm": 0.0011601662263274193, + "learning_rate": 8.982715397809738e-05, + "loss": 0.0317, + "step": 4180 + }, + { + "epoch": 1.1036187462922682, + "grad_norm": 0.006554402410984039, + "learning_rate": 8.969521044992744e-05, + "loss": 0.0035, + "step": 4185 + }, + { + "epoch": 1.1049370509524752, + "grad_norm": 0.34513652324676514, + "learning_rate": 8.956326692175749e-05, + "loss": 0.0036, + "step": 4190 + }, + { + "epoch": 1.1062553556126822, + "grad_norm": 0.283669650554657, + "learning_rate": 8.943132339358755e-05, + "loss": 0.0182, + "step": 4195 + }, + { + "epoch": 1.1075736602728892, + "grad_norm": 0.5376952290534973, + "learning_rate": 8.92993798654176e-05, + "loss": 0.0293, + "step": 4200 + }, + { + "epoch": 1.108891964933096, + "grad_norm": 0.01689724065363407, + "learning_rate": 8.916743633724767e-05, + "loss": 0.0206, + "step": 4205 + }, + { + "epoch": 1.110210269593303, + "grad_norm": 0.026538770645856857, + "learning_rate": 8.903549280907772e-05, + "loss": 0.0181, + "step": 4210 + }, + { + "epoch": 1.11152857425351, + "grad_norm": 0.6372873783111572, + "learning_rate": 8.890354928090777e-05, + "loss": 0.021, + "step": 4215 + }, + { + "epoch": 1.112846878913717, + "grad_norm": 0.06177428737282753, + "learning_rate": 8.877160575273783e-05, + "loss": 0.0033, + "step": 4220 + }, + { + "epoch": 1.114165183573924, + "grad_norm": 0.3712109923362732, + "learning_rate": 8.863966222456788e-05, + "loss": 0.0075, + "step": 4225 + }, + { + "epoch": 1.115483488234131, + "grad_norm": 0.030514653772115707, + "learning_rate": 8.850771869639795e-05, + "loss": 0.0183, + "step": 4230 + }, + { + "epoch": 1.116801792894338, + "grad_norm": 0.012861707247793674, + "learning_rate": 8.837577516822801e-05, + "loss": 0.0032, + "step": 4235 + }, + { + "epoch": 
1.118120097554545, + "grad_norm": 0.3278522789478302, + "learning_rate": 8.824383164005806e-05, + "loss": 0.0058, + "step": 4240 + }, + { + "epoch": 1.1194384022147519, + "grad_norm": 0.580259382724762, + "learning_rate": 8.811188811188812e-05, + "loss": 0.0068, + "step": 4245 + }, + { + "epoch": 1.1207567068749589, + "grad_norm": 0.007002575788646936, + "learning_rate": 8.797994458371817e-05, + "loss": 0.0063, + "step": 4250 + }, + { + "epoch": 1.1220750115351659, + "grad_norm": 0.22484643757343292, + "learning_rate": 8.784800105554823e-05, + "loss": 0.0167, + "step": 4255 + }, + { + "epoch": 1.1233933161953726, + "grad_norm": 0.004122686106711626, + "learning_rate": 8.771605752737829e-05, + "loss": 0.002, + "step": 4260 + }, + { + "epoch": 1.1247116208555796, + "grad_norm": 0.009832561016082764, + "learning_rate": 8.758411399920834e-05, + "loss": 0.0029, + "step": 4265 + }, + { + "epoch": 1.1260299255157866, + "grad_norm": 0.04854527860879898, + "learning_rate": 8.74521704710384e-05, + "loss": 0.0068, + "step": 4270 + }, + { + "epoch": 1.1273482301759936, + "grad_norm": 0.12221235036849976, + "learning_rate": 8.732022694286845e-05, + "loss": 0.003, + "step": 4275 + }, + { + "epoch": 1.1286665348362006, + "grad_norm": 0.005857539363205433, + "learning_rate": 8.718828341469851e-05, + "loss": 0.0022, + "step": 4280 + }, + { + "epoch": 1.1299848394964076, + "grad_norm": 0.10582758486270905, + "learning_rate": 8.705633988652856e-05, + "loss": 0.002, + "step": 4285 + }, + { + "epoch": 1.1313031441566146, + "grad_norm": 0.006190940272063017, + "learning_rate": 8.692439635835863e-05, + "loss": 0.0022, + "step": 4290 + }, + { + "epoch": 1.1326214488168216, + "grad_norm": 0.00221514655277133, + "learning_rate": 8.679245283018869e-05, + "loss": 0.0314, + "step": 4295 + }, + { + "epoch": 1.1339397534770286, + "grad_norm": 0.0796755850315094, + "learning_rate": 8.666050930201874e-05, + "loss": 0.0347, + "step": 4300 + }, + { + "epoch": 1.1352580581372356, + "grad_norm": 0.20088806748390198, + "learning_rate": 8.65285657738488e-05, + "loss": 0.0048, + "step": 4305 + }, + { + "epoch": 1.1365763627974426, + "grad_norm": 0.4018377363681793, + "learning_rate": 8.639662224567884e-05, + "loss": 0.0234, + "step": 4310 + }, + { + "epoch": 1.1378946674576496, + "grad_norm": 0.014961684122681618, + "learning_rate": 8.626467871750891e-05, + "loss": 0.0033, + "step": 4315 + }, + { + "epoch": 1.1392129721178565, + "grad_norm": 0.004534922540187836, + "learning_rate": 8.613273518933897e-05, + "loss": 0.0021, + "step": 4320 + }, + { + "epoch": 1.1405312767780633, + "grad_norm": 0.06340984255075455, + "learning_rate": 8.600079166116902e-05, + "loss": 0.0538, + "step": 4325 + }, + { + "epoch": 1.1418495814382703, + "grad_norm": 0.007374623324722052, + "learning_rate": 8.586884813299908e-05, + "loss": 0.0157, + "step": 4330 + }, + { + "epoch": 1.1431678860984773, + "grad_norm": 0.02313193492591381, + "learning_rate": 8.573690460482913e-05, + "loss": 0.0307, + "step": 4335 + }, + { + "epoch": 1.1444861907586843, + "grad_norm": 0.014071634039282799, + "learning_rate": 8.560496107665919e-05, + "loss": 0.0058, + "step": 4340 + }, + { + "epoch": 1.1458044954188913, + "grad_norm": 1.4664901494979858, + "learning_rate": 8.547301754848926e-05, + "loss": 0.0566, + "step": 4345 + }, + { + "epoch": 1.1471228000790983, + "grad_norm": 0.023680074140429497, + "learning_rate": 8.534107402031931e-05, + "loss": 0.0048, + "step": 4350 + }, + { + "epoch": 1.1484411047393053, + "grad_norm": 0.012555698864161968, + "learning_rate": 
8.520913049214937e-05, + "loss": 0.0076, + "step": 4355 + }, + { + "epoch": 1.1497594093995123, + "grad_norm": 0.013624129816889763, + "learning_rate": 8.507718696397941e-05, + "loss": 0.0373, + "step": 4360 + }, + { + "epoch": 1.1510777140597193, + "grad_norm": 0.015372387133538723, + "learning_rate": 8.494524343580947e-05, + "loss": 0.0147, + "step": 4365 + }, + { + "epoch": 1.1523960187199263, + "grad_norm": 0.3312993347644806, + "learning_rate": 8.481329990763954e-05, + "loss": 0.0299, + "step": 4370 + }, + { + "epoch": 1.1537143233801332, + "grad_norm": 0.023838184773921967, + "learning_rate": 8.468135637946959e-05, + "loss": 0.0226, + "step": 4375 + }, + { + "epoch": 1.15503262804034, + "grad_norm": 0.42516952753067017, + "learning_rate": 8.454941285129965e-05, + "loss": 0.0088, + "step": 4380 + }, + { + "epoch": 1.156350932700547, + "grad_norm": 0.6900278925895691, + "learning_rate": 8.44174693231297e-05, + "loss": 0.0245, + "step": 4385 + }, + { + "epoch": 1.157669237360754, + "grad_norm": 0.2932703197002411, + "learning_rate": 8.428552579495976e-05, + "loss": 0.0207, + "step": 4390 + }, + { + "epoch": 1.158987542020961, + "grad_norm": 0.12942780554294586, + "learning_rate": 8.415358226678982e-05, + "loss": 0.0037, + "step": 4395 + }, + { + "epoch": 1.160305846681168, + "grad_norm": 0.9499046802520752, + "learning_rate": 8.402163873861989e-05, + "loss": 0.0246, + "step": 4400 + }, + { + "epoch": 1.161624151341375, + "grad_norm": 0.008869118988513947, + "learning_rate": 8.388969521044994e-05, + "loss": 0.0171, + "step": 4405 + }, + { + "epoch": 1.162942456001582, + "grad_norm": 1.7409231662750244, + "learning_rate": 8.375775168227998e-05, + "loss": 0.017, + "step": 4410 + }, + { + "epoch": 1.164260760661789, + "grad_norm": 0.0020101398695260286, + "learning_rate": 8.362580815411004e-05, + "loss": 0.0027, + "step": 4415 + }, + { + "epoch": 1.165579065321996, + "grad_norm": 0.0785067081451416, + "learning_rate": 8.34938646259401e-05, + "loss": 0.0043, + "step": 4420 + }, + { + "epoch": 1.166897369982203, + "grad_norm": 0.0029506285209208727, + "learning_rate": 8.336192109777016e-05, + "loss": 0.0109, + "step": 4425 + }, + { + "epoch": 1.16821567464241, + "grad_norm": 0.02216683328151703, + "learning_rate": 8.322997756960022e-05, + "loss": 0.0026, + "step": 4430 + }, + { + "epoch": 1.1695339793026167, + "grad_norm": 0.02216639369726181, + "learning_rate": 8.309803404143027e-05, + "loss": 0.0045, + "step": 4435 + }, + { + "epoch": 1.170852283962824, + "grad_norm": 0.0, + "learning_rate": 8.296609051326033e-05, + "loss": 0.006, + "step": 4440 + }, + { + "epoch": 1.1721705886230307, + "grad_norm": 0.0019736960530281067, + "learning_rate": 8.283414698509039e-05, + "loss": 0.0078, + "step": 4445 + }, + { + "epoch": 1.1734888932832377, + "grad_norm": 0.012957746163010597, + "learning_rate": 8.270220345692044e-05, + "loss": 0.002, + "step": 4450 + }, + { + "epoch": 1.1748071979434447, + "grad_norm": 0.010877869091928005, + "learning_rate": 8.25702599287505e-05, + "loss": 0.0237, + "step": 4455 + }, + { + "epoch": 1.1761255026036517, + "grad_norm": 0.005947659723460674, + "learning_rate": 8.243831640058055e-05, + "loss": 0.0341, + "step": 4460 + }, + { + "epoch": 1.1774438072638587, + "grad_norm": 0.0005026470171287656, + "learning_rate": 8.230637287241061e-05, + "loss": 0.0033, + "step": 4465 + }, + { + "epoch": 1.1787621119240657, + "grad_norm": 0.022054588422179222, + "learning_rate": 8.217442934424066e-05, + "loss": 0.0042, + "step": 4470 + }, + { + "epoch": 1.1800804165842727, + 
"grad_norm": 0.7929030656814575, + "learning_rate": 8.204248581607072e-05, + "loss": 0.0076, + "step": 4475 + }, + { + "epoch": 1.1813987212444796, + "grad_norm": 0.39052629470825195, + "learning_rate": 8.191054228790078e-05, + "loss": 0.0228, + "step": 4480 + }, + { + "epoch": 1.1827170259046866, + "grad_norm": 0.007177622988820076, + "learning_rate": 8.177859875973084e-05, + "loss": 0.01, + "step": 4485 + }, + { + "epoch": 1.1840353305648936, + "grad_norm": 0.006175135262310505, + "learning_rate": 8.16466552315609e-05, + "loss": 0.0037, + "step": 4490 + }, + { + "epoch": 1.1853536352251006, + "grad_norm": 0.0356481671333313, + "learning_rate": 8.151471170339096e-05, + "loss": 0.0024, + "step": 4495 + }, + { + "epoch": 1.1866719398853074, + "grad_norm": 0.19069480895996094, + "learning_rate": 8.138276817522101e-05, + "loss": 0.0048, + "step": 4500 + }, + { + "epoch": 1.1866719398853074, + "eval_loss": 0.026386437937617302, + "eval_runtime": 452.2896, + "eval_samples_per_second": 7.455, + "eval_steps_per_second": 3.728, + "step": 4500 + }, + { + "epoch": 1.1879902445455144, + "grad_norm": 0.002254961524158716, + "learning_rate": 8.125082464705107e-05, + "loss": 0.0014, + "step": 4505 + }, + { + "epoch": 1.1893085492057214, + "grad_norm": 0.8026870489120483, + "learning_rate": 8.111888111888112e-05, + "loss": 0.0411, + "step": 4510 + }, + { + "epoch": 1.1906268538659284, + "grad_norm": 0.47328072786331177, + "learning_rate": 8.098693759071118e-05, + "loss": 0.0271, + "step": 4515 + }, + { + "epoch": 1.1919451585261354, + "grad_norm": 0.4888288676738739, + "learning_rate": 8.085499406254123e-05, + "loss": 0.039, + "step": 4520 + }, + { + "epoch": 1.1932634631863424, + "grad_norm": 0.000925812462810427, + "learning_rate": 8.072305053437129e-05, + "loss": 0.0461, + "step": 4525 + }, + { + "epoch": 1.1945817678465493, + "grad_norm": 0.12472371757030487, + "learning_rate": 8.059110700620135e-05, + "loss": 0.0037, + "step": 4530 + }, + { + "epoch": 1.1959000725067563, + "grad_norm": 0.002875336678698659, + "learning_rate": 8.04591634780314e-05, + "loss": 0.0425, + "step": 4535 + }, + { + "epoch": 1.1972183771669633, + "grad_norm": 0.042056187987327576, + "learning_rate": 8.032721994986147e-05, + "loss": 0.0068, + "step": 4540 + }, + { + "epoch": 1.1985366818271703, + "grad_norm": 0.157605841755867, + "learning_rate": 8.019527642169153e-05, + "loss": 0.0179, + "step": 4545 + }, + { + "epoch": 1.1998549864873773, + "grad_norm": 0.005153563339263201, + "learning_rate": 8.006333289352158e-05, + "loss": 0.0045, + "step": 4550 + }, + { + "epoch": 1.201173291147584, + "grad_norm": 0.02541598491370678, + "learning_rate": 7.993138936535164e-05, + "loss": 0.0041, + "step": 4555 + }, + { + "epoch": 1.2024915958077913, + "grad_norm": 0.04266195371747017, + "learning_rate": 7.979944583718168e-05, + "loss": 0.0121, + "step": 4560 + }, + { + "epoch": 1.203809900467998, + "grad_norm": 0.36108532547950745, + "learning_rate": 7.966750230901175e-05, + "loss": 0.0147, + "step": 4565 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.40405452251434326, + "learning_rate": 7.95355587808418e-05, + "loss": 0.0056, + "step": 4570 + }, + { + "epoch": 1.206446509788412, + "grad_norm": 0.030422702431678772, + "learning_rate": 7.940361525267186e-05, + "loss": 0.0055, + "step": 4575 + }, + { + "epoch": 1.207764814448619, + "grad_norm": 0.014555396512150764, + "learning_rate": 7.927167172450192e-05, + "loss": 0.0029, + "step": 4580 + }, + { + "epoch": 1.209083119108826, + "grad_norm": 0.33962950110435486, + 
"learning_rate": 7.913972819633197e-05, + "loss": 0.0191, + "step": 4585 + }, + { + "epoch": 1.210401423769033, + "grad_norm": 0.040150560438632965, + "learning_rate": 7.900778466816203e-05, + "loss": 0.0096, + "step": 4590 + }, + { + "epoch": 1.21171972842924, + "grad_norm": 0.2968510091304779, + "learning_rate": 7.88758411399921e-05, + "loss": 0.0311, + "step": 4595 + }, + { + "epoch": 1.213038033089447, + "grad_norm": 0.04709814116358757, + "learning_rate": 7.874389761182215e-05, + "loss": 0.0175, + "step": 4600 + }, + { + "epoch": 1.214356337749654, + "grad_norm": 0.1379537284374237, + "learning_rate": 7.861195408365221e-05, + "loss": 0.02, + "step": 4605 + }, + { + "epoch": 1.215674642409861, + "grad_norm": 0.018291711807250977, + "learning_rate": 7.848001055548225e-05, + "loss": 0.003, + "step": 4610 + }, + { + "epoch": 1.216992947070068, + "grad_norm": 0.041676126420497894, + "learning_rate": 7.83480670273123e-05, + "loss": 0.0054, + "step": 4615 + }, + { + "epoch": 1.2183112517302748, + "grad_norm": 0.0013747498160228133, + "learning_rate": 7.821612349914237e-05, + "loss": 0.0132, + "step": 4620 + }, + { + "epoch": 1.2196295563904818, + "grad_norm": 0.0050489697605371475, + "learning_rate": 7.808417997097243e-05, + "loss": 0.0272, + "step": 4625 + }, + { + "epoch": 1.2209478610506888, + "grad_norm": 0.017974581569433212, + "learning_rate": 7.795223644280249e-05, + "loss": 0.0037, + "step": 4630 + }, + { + "epoch": 1.2222661657108957, + "grad_norm": 0.001916698063723743, + "learning_rate": 7.782029291463254e-05, + "loss": 0.002, + "step": 4635 + }, + { + "epoch": 1.2235844703711027, + "grad_norm": 0.05344574153423309, + "learning_rate": 7.76883493864626e-05, + "loss": 0.0114, + "step": 4640 + }, + { + "epoch": 1.2249027750313097, + "grad_norm": 0.22823786735534668, + "learning_rate": 7.755640585829265e-05, + "loss": 0.0296, + "step": 4645 + }, + { + "epoch": 1.2262210796915167, + "grad_norm": 0.02051074244081974, + "learning_rate": 7.742446233012272e-05, + "loss": 0.0037, + "step": 4650 + }, + { + "epoch": 1.2275393843517237, + "grad_norm": 0.9797061681747437, + "learning_rate": 7.729251880195276e-05, + "loss": 0.011, + "step": 4655 + }, + { + "epoch": 1.2288576890119307, + "grad_norm": 0.0017285927897319198, + "learning_rate": 7.716057527378282e-05, + "loss": 0.0224, + "step": 4660 + }, + { + "epoch": 1.2301759936721377, + "grad_norm": 0.021783018484711647, + "learning_rate": 7.702863174561288e-05, + "loss": 0.0174, + "step": 4665 + }, + { + "epoch": 1.2314942983323447, + "grad_norm": 0.00763307698071003, + "learning_rate": 7.689668821744293e-05, + "loss": 0.0516, + "step": 4670 + }, + { + "epoch": 1.2328126029925515, + "grad_norm": 0.32605209946632385, + "learning_rate": 7.676474468927299e-05, + "loss": 0.0301, + "step": 4675 + }, + { + "epoch": 1.2341309076527585, + "grad_norm": 1.2027722597122192, + "learning_rate": 7.663280116110306e-05, + "loss": 0.0474, + "step": 4680 + }, + { + "epoch": 1.2354492123129655, + "grad_norm": 0.10201717168092728, + "learning_rate": 7.650085763293311e-05, + "loss": 0.0144, + "step": 4685 + }, + { + "epoch": 1.2367675169731724, + "grad_norm": 0.013835664838552475, + "learning_rate": 7.636891410476317e-05, + "loss": 0.0024, + "step": 4690 + }, + { + "epoch": 1.2380858216333794, + "grad_norm": 0.005699916277080774, + "learning_rate": 7.623697057659322e-05, + "loss": 0.0089, + "step": 4695 + }, + { + "epoch": 1.2394041262935864, + "grad_norm": 0.16583332419395447, + "learning_rate": 7.610502704842328e-05, + "loss": 0.019, + "step": 4700 + }, + { + 
"epoch": 1.2407224309537934, + "grad_norm": 0.2734023332595825, + "learning_rate": 7.597308352025333e-05, + "loss": 0.0041, + "step": 4705 + }, + { + "epoch": 1.2420407356140004, + "grad_norm": 0.04209504276514053, + "learning_rate": 7.584113999208339e-05, + "loss": 0.0292, + "step": 4710 + }, + { + "epoch": 1.2433590402742074, + "grad_norm": 0.0303195733577013, + "learning_rate": 7.570919646391345e-05, + "loss": 0.0019, + "step": 4715 + }, + { + "epoch": 1.2446773449344144, + "grad_norm": 0.014011899940669537, + "learning_rate": 7.55772529357435e-05, + "loss": 0.0236, + "step": 4720 + }, + { + "epoch": 1.2459956495946214, + "grad_norm": 0.37838876247406006, + "learning_rate": 7.544530940757356e-05, + "loss": 0.0081, + "step": 4725 + }, + { + "epoch": 1.2473139542548284, + "grad_norm": 0.003717717481777072, + "learning_rate": 7.531336587940361e-05, + "loss": 0.0036, + "step": 4730 + }, + { + "epoch": 1.2486322589150354, + "grad_norm": 1.2284752130508423, + "learning_rate": 7.518142235123368e-05, + "loss": 0.0089, + "step": 4735 + }, + { + "epoch": 1.2499505635752421, + "grad_norm": 0.015356095507740974, + "learning_rate": 7.504947882306374e-05, + "loss": 0.0074, + "step": 4740 + }, + { + "epoch": 1.2512688682354491, + "grad_norm": 0.0020383282098919153, + "learning_rate": 7.49175352948938e-05, + "loss": 0.0444, + "step": 4745 + }, + { + "epoch": 1.2525871728956561, + "grad_norm": 0.006680132355540991, + "learning_rate": 7.478559176672385e-05, + "loss": 0.009, + "step": 4750 + }, + { + "epoch": 1.2539054775558631, + "grad_norm": 0.01650019735097885, + "learning_rate": 7.465364823855389e-05, + "loss": 0.0022, + "step": 4755 + }, + { + "epoch": 1.2552237822160701, + "grad_norm": 0.009536102414131165, + "learning_rate": 7.452170471038396e-05, + "loss": 0.0026, + "step": 4760 + }, + { + "epoch": 1.256542086876277, + "grad_norm": 0.04677430912852287, + "learning_rate": 7.438976118221402e-05, + "loss": 0.004, + "step": 4765 + }, + { + "epoch": 1.257860391536484, + "grad_norm": 0.007777783088386059, + "learning_rate": 7.425781765404407e-05, + "loss": 0.0112, + "step": 4770 + }, + { + "epoch": 1.259178696196691, + "grad_norm": 0.03724197298288345, + "learning_rate": 7.412587412587413e-05, + "loss": 0.0065, + "step": 4775 + }, + { + "epoch": 1.260497000856898, + "grad_norm": 0.0023958412930369377, + "learning_rate": 7.399393059770418e-05, + "loss": 0.0238, + "step": 4780 + }, + { + "epoch": 1.261815305517105, + "grad_norm": 0.0036889975890517235, + "learning_rate": 7.386198706953424e-05, + "loss": 0.0012, + "step": 4785 + }, + { + "epoch": 1.263133610177312, + "grad_norm": 0.0009220903157256544, + "learning_rate": 7.373004354136431e-05, + "loss": 0.0017, + "step": 4790 + }, + { + "epoch": 1.2644519148375188, + "grad_norm": 0.0033395602367818356, + "learning_rate": 7.359810001319436e-05, + "loss": 0.0474, + "step": 4795 + }, + { + "epoch": 1.265770219497726, + "grad_norm": 0.004093261435627937, + "learning_rate": 7.346615648502442e-05, + "loss": 0.0025, + "step": 4800 + }, + { + "epoch": 1.2670885241579328, + "grad_norm": 0.004395488649606705, + "learning_rate": 7.333421295685446e-05, + "loss": 0.0011, + "step": 4805 + }, + { + "epoch": 1.2684068288181398, + "grad_norm": 0.024034051224589348, + "learning_rate": 7.320226942868452e-05, + "loss": 0.0027, + "step": 4810 + }, + { + "epoch": 1.2697251334783468, + "grad_norm": 0.9501499533653259, + "learning_rate": 7.307032590051459e-05, + "loss": 0.0279, + "step": 4815 + }, + { + "epoch": 1.2710434381385538, + "grad_norm": 0.008805549703538418, + 
"learning_rate": 7.293838237234464e-05, + "loss": 0.0403, + "step": 4820 + }, + { + "epoch": 1.2723617427987608, + "grad_norm": 0.01750873774290085, + "learning_rate": 7.28064388441747e-05, + "loss": 0.0571, + "step": 4825 + }, + { + "epoch": 1.2736800474589678, + "grad_norm": 0.004490260500460863, + "learning_rate": 7.267449531600475e-05, + "loss": 0.0269, + "step": 4830 + }, + { + "epoch": 1.2749983521191748, + "grad_norm": 0.07510064542293549, + "learning_rate": 7.254255178783481e-05, + "loss": 0.0123, + "step": 4835 + }, + { + "epoch": 1.2763166567793818, + "grad_norm": 0.039783038198947906, + "learning_rate": 7.241060825966486e-05, + "loss": 0.0137, + "step": 4840 + }, + { + "epoch": 1.2776349614395888, + "grad_norm": 0.019004900008440018, + "learning_rate": 7.227866473149493e-05, + "loss": 0.0047, + "step": 4845 + }, + { + "epoch": 1.2789532660997955, + "grad_norm": 0.04813052713871002, + "learning_rate": 7.214672120332499e-05, + "loss": 0.0021, + "step": 4850 + }, + { + "epoch": 1.2802715707600028, + "grad_norm": 0.00835048221051693, + "learning_rate": 7.201477767515503e-05, + "loss": 0.0014, + "step": 4855 + }, + { + "epoch": 1.2815898754202095, + "grad_norm": 0.008609198965132236, + "learning_rate": 7.188283414698509e-05, + "loss": 0.0219, + "step": 4860 + }, + { + "epoch": 1.2829081800804165, + "grad_norm": 0.007337458431720734, + "learning_rate": 7.175089061881514e-05, + "loss": 0.0014, + "step": 4865 + }, + { + "epoch": 1.2842264847406235, + "grad_norm": 0.0032645913306623697, + "learning_rate": 7.161894709064521e-05, + "loss": 0.0026, + "step": 4870 + }, + { + "epoch": 1.2855447894008305, + "grad_norm": 0.27384671568870544, + "learning_rate": 7.148700356247527e-05, + "loss": 0.0227, + "step": 4875 + }, + { + "epoch": 1.2868630940610375, + "grad_norm": 0.03584875538945198, + "learning_rate": 7.135506003430532e-05, + "loss": 0.0299, + "step": 4880 + }, + { + "epoch": 1.2881813987212445, + "grad_norm": 0.03482440486550331, + "learning_rate": 7.122311650613538e-05, + "loss": 0.0125, + "step": 4885 + }, + { + "epoch": 1.2894997033814515, + "grad_norm": 0.005974395200610161, + "learning_rate": 7.109117297796543e-05, + "loss": 0.0029, + "step": 4890 + }, + { + "epoch": 1.2908180080416585, + "grad_norm": 0.01820153370499611, + "learning_rate": 7.095922944979549e-05, + "loss": 0.0254, + "step": 4895 + }, + { + "epoch": 1.2921363127018655, + "grad_norm": 0.1733965277671814, + "learning_rate": 7.082728592162555e-05, + "loss": 0.028, + "step": 4900 + }, + { + "epoch": 1.2934546173620725, + "grad_norm": 1.3017303943634033, + "learning_rate": 7.06953423934556e-05, + "loss": 0.0213, + "step": 4905 + }, + { + "epoch": 1.2947729220222794, + "grad_norm": 0.01360877975821495, + "learning_rate": 7.056339886528566e-05, + "loss": 0.0039, + "step": 4910 + }, + { + "epoch": 1.2960912266824862, + "grad_norm": 0.01503999624401331, + "learning_rate": 7.043145533711571e-05, + "loss": 0.0102, + "step": 4915 + }, + { + "epoch": 1.2974095313426934, + "grad_norm": 0.2200804352760315, + "learning_rate": 7.029951180894577e-05, + "loss": 0.0461, + "step": 4920 + }, + { + "epoch": 1.2987278360029002, + "grad_norm": 0.08512946963310242, + "learning_rate": 7.016756828077582e-05, + "loss": 0.0066, + "step": 4925 + }, + { + "epoch": 1.3000461406631072, + "grad_norm": 0.08296570926904678, + "learning_rate": 7.00356247526059e-05, + "loss": 0.0223, + "step": 4930 + }, + { + "epoch": 1.3013644453233142, + "grad_norm": 0.008866079151630402, + "learning_rate": 6.990368122443595e-05, + "loss": 0.0032, + "step": 4935 + }, + 
{ + "epoch": 1.3026827499835212, + "grad_norm": 0.024493014439940453, + "learning_rate": 6.9771737696266e-05, + "loss": 0.0128, + "step": 4940 + }, + { + "epoch": 1.3040010546437282, + "grad_norm": 0.08965341746807098, + "learning_rate": 6.963979416809606e-05, + "loss": 0.028, + "step": 4945 + }, + { + "epoch": 1.3053193593039352, + "grad_norm": 0.023156631737947464, + "learning_rate": 6.950785063992612e-05, + "loss": 0.0187, + "step": 4950 + }, + { + "epoch": 1.3066376639641422, + "grad_norm": 0.18552155792713165, + "learning_rate": 6.937590711175617e-05, + "loss": 0.0424, + "step": 4955 + }, + { + "epoch": 1.3079559686243492, + "grad_norm": 0.02200198918581009, + "learning_rate": 6.924396358358623e-05, + "loss": 0.0148, + "step": 4960 + }, + { + "epoch": 1.3092742732845561, + "grad_norm": 0.00568364467471838, + "learning_rate": 6.911202005541628e-05, + "loss": 0.0199, + "step": 4965 + }, + { + "epoch": 1.310592577944763, + "grad_norm": 0.021591177210211754, + "learning_rate": 6.898007652724634e-05, + "loss": 0.0092, + "step": 4970 + }, + { + "epoch": 1.3119108826049701, + "grad_norm": 0.327177494764328, + "learning_rate": 6.88481329990764e-05, + "loss": 0.0047, + "step": 4975 + }, + { + "epoch": 1.313229187265177, + "grad_norm": 0.024512887001037598, + "learning_rate": 6.871618947090645e-05, + "loss": 0.0046, + "step": 4980 + }, + { + "epoch": 1.314547491925384, + "grad_norm": 0.05725006014108658, + "learning_rate": 6.858424594273652e-05, + "loss": 0.0227, + "step": 4985 + }, + { + "epoch": 1.3158657965855909, + "grad_norm": 0.011280277743935585, + "learning_rate": 6.845230241456658e-05, + "loss": 0.0056, + "step": 4990 + }, + { + "epoch": 1.3171841012457979, + "grad_norm": 0.022504402324557304, + "learning_rate": 6.832035888639663e-05, + "loss": 0.0029, + "step": 4995 + }, + { + "epoch": 1.3185024059060049, + "grad_norm": 0.02168826013803482, + "learning_rate": 6.818841535822669e-05, + "loss": 0.0198, + "step": 5000 + }, + { + "epoch": 1.3185024059060049, + "eval_loss": 0.025039294734597206, + "eval_runtime": 452.1097, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 5000 + }, + { + "epoch": 1.3198207105662119, + "grad_norm": 0.0064329709857702255, + "learning_rate": 6.805647183005673e-05, + "loss": 0.0299, + "step": 5005 + }, + { + "epoch": 1.3211390152264189, + "grad_norm": 0.00267885928042233, + "learning_rate": 6.79245283018868e-05, + "loss": 0.0065, + "step": 5010 + }, + { + "epoch": 1.3224573198866258, + "grad_norm": 0.6842889189720154, + "learning_rate": 6.779258477371685e-05, + "loss": 0.008, + "step": 5015 + }, + { + "epoch": 1.3237756245468328, + "grad_norm": 0.002985635306686163, + "learning_rate": 6.766064124554691e-05, + "loss": 0.0119, + "step": 5020 + }, + { + "epoch": 1.3250939292070396, + "grad_norm": 0.019304940477013588, + "learning_rate": 6.752869771737696e-05, + "loss": 0.0041, + "step": 5025 + }, + { + "epoch": 1.3264122338672468, + "grad_norm": 0.011305035091936588, + "learning_rate": 6.739675418920702e-05, + "loss": 0.0031, + "step": 5030 + }, + { + "epoch": 1.3277305385274536, + "grad_norm": 0.006184784695506096, + "learning_rate": 6.726481066103708e-05, + "loss": 0.0081, + "step": 5035 + }, + { + "epoch": 1.3290488431876606, + "grad_norm": 0.0073184361681342125, + "learning_rate": 6.713286713286715e-05, + "loss": 0.0202, + "step": 5040 + }, + { + "epoch": 1.3303671478478676, + "grad_norm": 0.006566181313246489, + "learning_rate": 6.70009236046972e-05, + "loss": 0.0052, + "step": 5045 + }, + { + "epoch": 1.3316854525080746, + 
"grad_norm": 0.31427526473999023, + "learning_rate": 6.686898007652726e-05, + "loss": 0.017, + "step": 5050 + }, + { + "epoch": 1.3330037571682816, + "grad_norm": 0.005085447803139687, + "learning_rate": 6.67370365483573e-05, + "loss": 0.009, + "step": 5055 + }, + { + "epoch": 1.3343220618284886, + "grad_norm": 0.2745366096496582, + "learning_rate": 6.660509302018735e-05, + "loss": 0.0119, + "step": 5060 + }, + { + "epoch": 1.3356403664886956, + "grad_norm": 0.2871796786785126, + "learning_rate": 6.647314949201742e-05, + "loss": 0.0158, + "step": 5065 + }, + { + "epoch": 1.3369586711489025, + "grad_norm": 0.2774186134338379, + "learning_rate": 6.634120596384748e-05, + "loss": 0.0084, + "step": 5070 + }, + { + "epoch": 1.3382769758091095, + "grad_norm": 0.013278775848448277, + "learning_rate": 6.620926243567753e-05, + "loss": 0.0111, + "step": 5075 + }, + { + "epoch": 1.3395952804693165, + "grad_norm": 0.01614517532289028, + "learning_rate": 6.607731890750759e-05, + "loss": 0.0066, + "step": 5080 + }, + { + "epoch": 1.3409135851295235, + "grad_norm": 0.0037789656780660152, + "learning_rate": 6.594537537933765e-05, + "loss": 0.0142, + "step": 5085 + }, + { + "epoch": 1.3422318897897303, + "grad_norm": 0.03221861273050308, + "learning_rate": 6.58134318511677e-05, + "loss": 0.0155, + "step": 5090 + }, + { + "epoch": 1.3435501944499375, + "grad_norm": 0.005637989845126867, + "learning_rate": 6.568148832299776e-05, + "loss": 0.0022, + "step": 5095 + }, + { + "epoch": 1.3448684991101443, + "grad_norm": 0.0017844432732090354, + "learning_rate": 6.554954479482783e-05, + "loss": 0.0217, + "step": 5100 + }, + { + "epoch": 1.3461868037703513, + "grad_norm": 0.08099021762609482, + "learning_rate": 6.541760126665787e-05, + "loss": 0.0222, + "step": 5105 + }, + { + "epoch": 1.3475051084305583, + "grad_norm": 0.011909045279026031, + "learning_rate": 6.528565773848792e-05, + "loss": 0.0058, + "step": 5110 + }, + { + "epoch": 1.3488234130907653, + "grad_norm": 0.7332578301429749, + "learning_rate": 6.515371421031798e-05, + "loss": 0.0286, + "step": 5115 + }, + { + "epoch": 1.3501417177509722, + "grad_norm": 0.3415885865688324, + "learning_rate": 6.502177068214804e-05, + "loss": 0.1191, + "step": 5120 + }, + { + "epoch": 1.3514600224111792, + "grad_norm": 0.00904211588203907, + "learning_rate": 6.48898271539781e-05, + "loss": 0.0043, + "step": 5125 + }, + { + "epoch": 1.3527783270713862, + "grad_norm": 0.1978830248117447, + "learning_rate": 6.475788362580816e-05, + "loss": 0.0316, + "step": 5130 + }, + { + "epoch": 1.3540966317315932, + "grad_norm": 0.10229042172431946, + "learning_rate": 6.462594009763822e-05, + "loss": 0.0194, + "step": 5135 + }, + { + "epoch": 1.3554149363918002, + "grad_norm": 0.4457210600376129, + "learning_rate": 6.449399656946827e-05, + "loss": 0.0276, + "step": 5140 + }, + { + "epoch": 1.356733241052007, + "grad_norm": 0.023706572130322456, + "learning_rate": 6.436205304129833e-05, + "loss": 0.0163, + "step": 5145 + }, + { + "epoch": 1.3580515457122142, + "grad_norm": 1.166896939277649, + "learning_rate": 6.423010951312838e-05, + "loss": 0.0189, + "step": 5150 + }, + { + "epoch": 1.359369850372421, + "grad_norm": 0.0016115796752274036, + "learning_rate": 6.409816598495844e-05, + "loss": 0.0191, + "step": 5155 + }, + { + "epoch": 1.360688155032628, + "grad_norm": 0.00786682777106762, + "learning_rate": 6.39662224567885e-05, + "loss": 0.0119, + "step": 5160 + }, + { + "epoch": 1.362006459692835, + "grad_norm": 1.042732834815979, + "learning_rate": 6.383427892861855e-05, + "loss": 
0.0497, + "step": 5165 + }, + { + "epoch": 1.363324764353042, + "grad_norm": 0.007983304560184479, + "learning_rate": 6.37023354004486e-05, + "loss": 0.044, + "step": 5170 + }, + { + "epoch": 1.364643069013249, + "grad_norm": 0.009767642244696617, + "learning_rate": 6.357039187227866e-05, + "loss": 0.0405, + "step": 5175 + }, + { + "epoch": 1.365961373673456, + "grad_norm": 0.03164628520607948, + "learning_rate": 6.343844834410873e-05, + "loss": 0.0138, + "step": 5180 + }, + { + "epoch": 1.367279678333663, + "grad_norm": 0.004159921780228615, + "learning_rate": 6.330650481593879e-05, + "loss": 0.0045, + "step": 5185 + }, + { + "epoch": 1.36859798299387, + "grad_norm": 0.004395391326397657, + "learning_rate": 6.317456128776884e-05, + "loss": 0.0046, + "step": 5190 + }, + { + "epoch": 1.369916287654077, + "grad_norm": 0.011886746622622013, + "learning_rate": 6.30426177595989e-05, + "loss": 0.0064, + "step": 5195 + }, + { + "epoch": 1.371234592314284, + "grad_norm": 0.2259266972541809, + "learning_rate": 6.291067423142895e-05, + "loss": 0.0076, + "step": 5200 + }, + { + "epoch": 1.372552896974491, + "grad_norm": 0.01407301053404808, + "learning_rate": 6.277873070325901e-05, + "loss": 0.0201, + "step": 5205 + }, + { + "epoch": 1.3738712016346977, + "grad_norm": 0.00911578256636858, + "learning_rate": 6.264678717508906e-05, + "loss": 0.0164, + "step": 5210 + }, + { + "epoch": 1.3751895062949049, + "grad_norm": 0.20968014001846313, + "learning_rate": 6.251484364691912e-05, + "loss": 0.0075, + "step": 5215 + }, + { + "epoch": 1.3765078109551117, + "grad_norm": 0.008801166899502277, + "learning_rate": 6.238290011874918e-05, + "loss": 0.0068, + "step": 5220 + }, + { + "epoch": 1.3778261156153186, + "grad_norm": 0.007181806955486536, + "learning_rate": 6.225095659057923e-05, + "loss": 0.0136, + "step": 5225 + }, + { + "epoch": 1.3791444202755256, + "grad_norm": 0.7527109980583191, + "learning_rate": 6.211901306240929e-05, + "loss": 0.0287, + "step": 5230 + }, + { + "epoch": 1.3804627249357326, + "grad_norm": 0.039015207439661026, + "learning_rate": 6.198706953423936e-05, + "loss": 0.0326, + "step": 5235 + }, + { + "epoch": 1.3817810295959396, + "grad_norm": 0.021076606586575508, + "learning_rate": 6.185512600606941e-05, + "loss": 0.0191, + "step": 5240 + }, + { + "epoch": 1.3830993342561466, + "grad_norm": 0.016630731523036957, + "learning_rate": 6.172318247789947e-05, + "loss": 0.0131, + "step": 5245 + }, + { + "epoch": 1.3844176389163536, + "grad_norm": 0.011133644729852676, + "learning_rate": 6.159123894972952e-05, + "loss": 0.0029, + "step": 5250 + }, + { + "epoch": 1.3857359435765606, + "grad_norm": 0.6434677243232727, + "learning_rate": 6.145929542155957e-05, + "loss": 0.0091, + "step": 5255 + }, + { + "epoch": 1.3870542482367676, + "grad_norm": 0.051020298153162, + "learning_rate": 6.132735189338964e-05, + "loss": 0.0086, + "step": 5260 + }, + { + "epoch": 1.3883725528969744, + "grad_norm": 0.016413932666182518, + "learning_rate": 6.119540836521969e-05, + "loss": 0.0061, + "step": 5265 + }, + { + "epoch": 1.3896908575571816, + "grad_norm": 0.005769540090113878, + "learning_rate": 6.106346483704975e-05, + "loss": 0.0027, + "step": 5270 + }, + { + "epoch": 1.3910091622173884, + "grad_norm": 0.06687796860933304, + "learning_rate": 6.09315213088798e-05, + "loss": 0.0423, + "step": 5275 + }, + { + "epoch": 1.3923274668775953, + "grad_norm": 0.005641553085297346, + "learning_rate": 6.079957778070986e-05, + "loss": 0.0353, + "step": 5280 + }, + { + "epoch": 1.3936457715378023, + "grad_norm": 
0.04460568353533745, + "learning_rate": 6.066763425253992e-05, + "loss": 0.0041, + "step": 5285 + }, + { + "epoch": 1.3949640761980093, + "grad_norm": 0.0387534461915493, + "learning_rate": 6.0535690724369976e-05, + "loss": 0.006, + "step": 5290 + }, + { + "epoch": 1.3962823808582163, + "grad_norm": 0.010292598977684975, + "learning_rate": 6.040374719620003e-05, + "loss": 0.0038, + "step": 5295 + }, + { + "epoch": 1.3976006855184233, + "grad_norm": 0.3646155297756195, + "learning_rate": 6.0271803668030094e-05, + "loss": 0.0111, + "step": 5300 + }, + { + "epoch": 1.3989189901786303, + "grad_norm": 0.022035539150238037, + "learning_rate": 6.0139860139860136e-05, + "loss": 0.0507, + "step": 5305 + }, + { + "epoch": 1.4002372948388373, + "grad_norm": 0.003314939560368657, + "learning_rate": 6.00079166116902e-05, + "loss": 0.0132, + "step": 5310 + }, + { + "epoch": 1.4015555994990443, + "grad_norm": 0.0838267058134079, + "learning_rate": 5.9875973083520254e-05, + "loss": 0.0105, + "step": 5315 + }, + { + "epoch": 1.4028739041592513, + "grad_norm": 0.009368584491312504, + "learning_rate": 5.974402955535031e-05, + "loss": 0.0026, + "step": 5320 + }, + { + "epoch": 1.4041922088194583, + "grad_norm": 0.031248098239302635, + "learning_rate": 5.961208602718037e-05, + "loss": 0.0151, + "step": 5325 + }, + { + "epoch": 1.405510513479665, + "grad_norm": 0.06447605788707733, + "learning_rate": 5.948014249901043e-05, + "loss": 0.0219, + "step": 5330 + }, + { + "epoch": 1.4068288181398723, + "grad_norm": 0.010814374312758446, + "learning_rate": 5.9348198970840484e-05, + "loss": 0.0038, + "step": 5335 + }, + { + "epoch": 1.408147122800079, + "grad_norm": 0.6235967874526978, + "learning_rate": 5.9216255442670546e-05, + "loss": 0.0354, + "step": 5340 + }, + { + "epoch": 1.409465427460286, + "grad_norm": 0.026741521432995796, + "learning_rate": 5.90843119145006e-05, + "loss": 0.0032, + "step": 5345 + }, + { + "epoch": 1.410783732120493, + "grad_norm": 0.019413433969020844, + "learning_rate": 5.895236838633066e-05, + "loss": 0.0216, + "step": 5350 + }, + { + "epoch": 1.4121020367807, + "grad_norm": 0.0735543966293335, + "learning_rate": 5.8820424858160706e-05, + "loss": 0.0033, + "step": 5355 + }, + { + "epoch": 1.413420341440907, + "grad_norm": 0.005189546383917332, + "learning_rate": 5.868848132999076e-05, + "loss": 0.021, + "step": 5360 + }, + { + "epoch": 1.414738646101114, + "grad_norm": 0.21240335702896118, + "learning_rate": 5.8556537801820824e-05, + "loss": 0.0294, + "step": 5365 + }, + { + "epoch": 1.416056950761321, + "grad_norm": 0.010165920481085777, + "learning_rate": 5.842459427365088e-05, + "loss": 0.0021, + "step": 5370 + }, + { + "epoch": 1.417375255421528, + "grad_norm": 0.026774069294333458, + "learning_rate": 5.8292650745480936e-05, + "loss": 0.0299, + "step": 5375 + }, + { + "epoch": 1.418693560081735, + "grad_norm": 0.0019810455851256847, + "learning_rate": 5.816070721731099e-05, + "loss": 0.0029, + "step": 5380 + }, + { + "epoch": 1.4200118647419417, + "grad_norm": 0.038888879120349884, + "learning_rate": 5.8028763689141054e-05, + "loss": 0.0069, + "step": 5385 + }, + { + "epoch": 1.421330169402149, + "grad_norm": 0.016180936247110367, + "learning_rate": 5.789682016097111e-05, + "loss": 0.0032, + "step": 5390 + }, + { + "epoch": 1.4226484740623557, + "grad_norm": 0.01119404286146164, + "learning_rate": 5.7764876632801165e-05, + "loss": 0.0024, + "step": 5395 + }, + { + "epoch": 1.4239667787225627, + "grad_norm": 0.010486694052815437, + "learning_rate": 5.763293310463123e-05, + "loss": 
0.0324, + "step": 5400 + }, + { + "epoch": 1.4252850833827697, + "grad_norm": 0.005453066434711218, + "learning_rate": 5.750098957646127e-05, + "loss": 0.0038, + "step": 5405 + }, + { + "epoch": 1.4266033880429767, + "grad_norm": 0.17556461691856384, + "learning_rate": 5.736904604829133e-05, + "loss": 0.0305, + "step": 5410 + }, + { + "epoch": 1.4279216927031837, + "grad_norm": 0.03074715845286846, + "learning_rate": 5.723710252012139e-05, + "loss": 0.003, + "step": 5415 + }, + { + "epoch": 1.4292399973633907, + "grad_norm": 1.7238941192626953, + "learning_rate": 5.710515899195144e-05, + "loss": 0.0254, + "step": 5420 + }, + { + "epoch": 1.4305583020235977, + "grad_norm": 0.012462320737540722, + "learning_rate": 5.6973215463781506e-05, + "loss": 0.0018, + "step": 5425 + }, + { + "epoch": 1.4318766066838047, + "grad_norm": 0.021576853469014168, + "learning_rate": 5.684127193561156e-05, + "loss": 0.0472, + "step": 5430 + }, + { + "epoch": 1.4331949113440117, + "grad_norm": 0.2862134575843811, + "learning_rate": 5.670932840744162e-05, + "loss": 0.0258, + "step": 5435 + }, + { + "epoch": 1.4345132160042184, + "grad_norm": 0.28419312834739685, + "learning_rate": 5.657738487927168e-05, + "loss": 0.0053, + "step": 5440 + }, + { + "epoch": 1.4358315206644257, + "grad_norm": 0.013650139793753624, + "learning_rate": 5.6445441351101735e-05, + "loss": 0.0126, + "step": 5445 + }, + { + "epoch": 1.4371498253246324, + "grad_norm": 0.01203097216784954, + "learning_rate": 5.631349782293179e-05, + "loss": 0.0076, + "step": 5450 + }, + { + "epoch": 1.4384681299848394, + "grad_norm": 0.0881054624915123, + "learning_rate": 5.618155429476184e-05, + "loss": 0.0178, + "step": 5455 + }, + { + "epoch": 1.4397864346450464, + "grad_norm": 0.5258516669273376, + "learning_rate": 5.6049610766591895e-05, + "loss": 0.0112, + "step": 5460 + }, + { + "epoch": 1.4411047393052534, + "grad_norm": 0.001202153041958809, + "learning_rate": 5.591766723842196e-05, + "loss": 0.0089, + "step": 5465 + }, + { + "epoch": 1.4424230439654604, + "grad_norm": 0.4498993456363678, + "learning_rate": 5.5785723710252014e-05, + "loss": 0.0252, + "step": 5470 + }, + { + "epoch": 1.4437413486256674, + "grad_norm": 0.17477644979953766, + "learning_rate": 5.565378018208207e-05, + "loss": 0.0169, + "step": 5475 + }, + { + "epoch": 1.4450596532858744, + "grad_norm": 0.019443338736891747, + "learning_rate": 5.552183665391213e-05, + "loss": 0.0019, + "step": 5480 + }, + { + "epoch": 1.4463779579460814, + "grad_norm": 0.005653039086610079, + "learning_rate": 5.538989312574219e-05, + "loss": 0.0231, + "step": 5485 + }, + { + "epoch": 1.4476962626062884, + "grad_norm": 0.01554112322628498, + "learning_rate": 5.525794959757224e-05, + "loss": 0.0167, + "step": 5490 + }, + { + "epoch": 1.4490145672664954, + "grad_norm": 0.044272180646657944, + "learning_rate": 5.5126006069402305e-05, + "loss": 0.007, + "step": 5495 + }, + { + "epoch": 1.4503328719267023, + "grad_norm": 0.014857172966003418, + "learning_rate": 5.499406254123236e-05, + "loss": 0.0045, + "step": 5500 + }, + { + "epoch": 1.4503328719267023, + "eval_loss": 0.02392147295176983, + "eval_runtime": 452.468, + "eval_samples_per_second": 7.452, + "eval_steps_per_second": 3.726, + "step": 5500 + }, + { + "epoch": 1.4516511765869091, + "grad_norm": 0.007390835788100958, + "learning_rate": 5.486211901306241e-05, + "loss": 0.0171, + "step": 5505 + }, + { + "epoch": 1.4529694812471163, + "grad_norm": 0.0050474610179662704, + "learning_rate": 5.4730175484892466e-05, + "loss": 0.004, + "step": 5510 + }, + { + 
"epoch": 1.454287785907323, + "grad_norm": 0.08066163957118988, + "learning_rate": 5.459823195672252e-05, + "loss": 0.0103, + "step": 5515 + }, + { + "epoch": 1.45560609056753, + "grad_norm": 0.0062376330606639385, + "learning_rate": 5.4466288428552584e-05, + "loss": 0.0066, + "step": 5520 + }, + { + "epoch": 1.456924395227737, + "grad_norm": 0.00711809890344739, + "learning_rate": 5.433434490038264e-05, + "loss": 0.003, + "step": 5525 + }, + { + "epoch": 1.458242699887944, + "grad_norm": 0.004010149277746677, + "learning_rate": 5.4202401372212695e-05, + "loss": 0.0231, + "step": 5530 + }, + { + "epoch": 1.459561004548151, + "grad_norm": 0.4791967272758484, + "learning_rate": 5.407045784404276e-05, + "loss": 0.0277, + "step": 5535 + }, + { + "epoch": 1.460879309208358, + "grad_norm": 0.03979189693927765, + "learning_rate": 5.393851431587281e-05, + "loss": 0.0033, + "step": 5540 + }, + { + "epoch": 1.462197613868565, + "grad_norm": 0.03331119939684868, + "learning_rate": 5.380657078770287e-05, + "loss": 0.0187, + "step": 5545 + }, + { + "epoch": 1.463515918528772, + "grad_norm": 0.0042802803218364716, + "learning_rate": 5.367462725953293e-05, + "loss": 0.0032, + "step": 5550 + }, + { + "epoch": 1.464834223188979, + "grad_norm": 0.05439918115735054, + "learning_rate": 5.354268373136297e-05, + "loss": 0.0043, + "step": 5555 + }, + { + "epoch": 1.4661525278491858, + "grad_norm": 0.042643506079912186, + "learning_rate": 5.3410740203193036e-05, + "loss": 0.0059, + "step": 5560 + }, + { + "epoch": 1.467470832509393, + "grad_norm": 0.023453116416931152, + "learning_rate": 5.327879667502309e-05, + "loss": 0.0043, + "step": 5565 + }, + { + "epoch": 1.4687891371695998, + "grad_norm": 0.037712760269641876, + "learning_rate": 5.314685314685315e-05, + "loss": 0.0033, + "step": 5570 + }, + { + "epoch": 1.4701074418298068, + "grad_norm": 1.0485608577728271, + "learning_rate": 5.301490961868321e-05, + "loss": 0.0489, + "step": 5575 + }, + { + "epoch": 1.4714257464900138, + "grad_norm": 0.004728829488158226, + "learning_rate": 5.2882966090513265e-05, + "loss": 0.0067, + "step": 5580 + }, + { + "epoch": 1.4727440511502208, + "grad_norm": 0.027893677353858948, + "learning_rate": 5.275102256234332e-05, + "loss": 0.0208, + "step": 5585 + }, + { + "epoch": 1.4740623558104278, + "grad_norm": 0.02256879396736622, + "learning_rate": 5.2619079034173377e-05, + "loss": 0.0036, + "step": 5590 + }, + { + "epoch": 1.4753806604706348, + "grad_norm": 0.12636558711528778, + "learning_rate": 5.248713550600344e-05, + "loss": 0.0046, + "step": 5595 + }, + { + "epoch": 1.4766989651308418, + "grad_norm": 0.000997041119262576, + "learning_rate": 5.235519197783348e-05, + "loss": 0.0101, + "step": 5600 + }, + { + "epoch": 1.4780172697910487, + "grad_norm": 0.023494020104408264, + "learning_rate": 5.2223248449663543e-05, + "loss": 0.0039, + "step": 5605 + }, + { + "epoch": 1.4793355744512557, + "grad_norm": 0.01525307446718216, + "learning_rate": 5.20913049214936e-05, + "loss": 0.021, + "step": 5610 + }, + { + "epoch": 1.4806538791114627, + "grad_norm": 0.0024215306621044874, + "learning_rate": 5.1959361393323655e-05, + "loss": 0.0017, + "step": 5615 + }, + { + "epoch": 1.4819721837716697, + "grad_norm": 1.4708061218261719, + "learning_rate": 5.182741786515372e-05, + "loss": 0.04, + "step": 5620 + }, + { + "epoch": 1.4832904884318765, + "grad_norm": 0.015033531002700329, + "learning_rate": 5.169547433698377e-05, + "loss": 0.0042, + "step": 5625 + }, + { + "epoch": 1.4846087930920837, + "grad_norm": 0.0035444959066808224, + 
"learning_rate": 5.156353080881383e-05, + "loss": 0.0087, + "step": 5630 + }, + { + "epoch": 1.4859270977522905, + "grad_norm": 0.010087919421494007, + "learning_rate": 5.143158728064389e-05, + "loss": 0.0158, + "step": 5635 + }, + { + "epoch": 1.4872454024124975, + "grad_norm": 0.05779251083731651, + "learning_rate": 5.129964375247395e-05, + "loss": 0.0157, + "step": 5640 + }, + { + "epoch": 1.4885637070727045, + "grad_norm": 0.14927980303764343, + "learning_rate": 5.1167700224304e-05, + "loss": 0.0257, + "step": 5645 + }, + { + "epoch": 1.4898820117329115, + "grad_norm": 0.004252352751791477, + "learning_rate": 5.103575669613405e-05, + "loss": 0.0198, + "step": 5650 + }, + { + "epoch": 1.4912003163931185, + "grad_norm": 0.0029206848703324795, + "learning_rate": 5.090381316796411e-05, + "loss": 0.0016, + "step": 5655 + }, + { + "epoch": 1.4925186210533254, + "grad_norm": 0.005047530401498079, + "learning_rate": 5.077186963979417e-05, + "loss": 0.0023, + "step": 5660 + }, + { + "epoch": 1.4938369257135324, + "grad_norm": 0.003732564626261592, + "learning_rate": 5.0639926111624225e-05, + "loss": 0.0336, + "step": 5665 + }, + { + "epoch": 1.4951552303737394, + "grad_norm": 0.3832889497280121, + "learning_rate": 5.050798258345428e-05, + "loss": 0.0476, + "step": 5670 + }, + { + "epoch": 1.4964735350339464, + "grad_norm": 0.06733009219169617, + "learning_rate": 5.037603905528434e-05, + "loss": 0.0044, + "step": 5675 + }, + { + "epoch": 1.4977918396941532, + "grad_norm": 0.008067069575190544, + "learning_rate": 5.02440955271144e-05, + "loss": 0.0035, + "step": 5680 + }, + { + "epoch": 1.4991101443543604, + "grad_norm": 0.01706300489604473, + "learning_rate": 5.0112151998944454e-05, + "loss": 0.0031, + "step": 5685 + }, + { + "epoch": 1.5004284490145672, + "grad_norm": 0.009932024404406548, + "learning_rate": 4.998020847077451e-05, + "loss": 0.0587, + "step": 5690 + }, + { + "epoch": 1.5017467536747744, + "grad_norm": 0.006488936021924019, + "learning_rate": 4.9848264942604566e-05, + "loss": 0.002, + "step": 5695 + }, + { + "epoch": 1.5030650583349812, + "grad_norm": 0.17488756775856018, + "learning_rate": 4.971632141443462e-05, + "loss": 0.0245, + "step": 5700 + }, + { + "epoch": 1.5043833629951882, + "grad_norm": 0.3327178359031677, + "learning_rate": 4.9584377886264684e-05, + "loss": 0.0404, + "step": 5705 + }, + { + "epoch": 1.5057016676553951, + "grad_norm": 0.18467263877391815, + "learning_rate": 4.945243435809474e-05, + "loss": 0.0248, + "step": 5710 + }, + { + "epoch": 1.5070199723156021, + "grad_norm": 0.020061776041984558, + "learning_rate": 4.9320490829924795e-05, + "loss": 0.0034, + "step": 5715 + }, + { + "epoch": 1.5083382769758091, + "grad_norm": 0.0005288647953420877, + "learning_rate": 4.918854730175485e-05, + "loss": 0.0076, + "step": 5720 + }, + { + "epoch": 1.5096565816360161, + "grad_norm": 0.007515576668083668, + "learning_rate": 4.9056603773584906e-05, + "loss": 0.004, + "step": 5725 + }, + { + "epoch": 1.5109748862962231, + "grad_norm": 0.05365758761763573, + "learning_rate": 4.892466024541497e-05, + "loss": 0.0222, + "step": 5730 + }, + { + "epoch": 1.51229319095643, + "grad_norm": 0.00572391040623188, + "learning_rate": 4.8792716717245025e-05, + "loss": 0.0132, + "step": 5735 + }, + { + "epoch": 1.513611495616637, + "grad_norm": 0.21178627014160156, + "learning_rate": 4.8660773189075073e-05, + "loss": 0.0417, + "step": 5740 + }, + { + "epoch": 1.5149298002768439, + "grad_norm": 0.0641486868262291, + "learning_rate": 4.8528829660905136e-05, + "loss": 0.011, + "step": 
5745 + }, + { + "epoch": 1.516248104937051, + "grad_norm": 0.04451924189925194, + "learning_rate": 4.839688613273519e-05, + "loss": 0.012, + "step": 5750 + }, + { + "epoch": 1.5175664095972579, + "grad_norm": 0.019951259717345238, + "learning_rate": 4.826494260456525e-05, + "loss": 0.009, + "step": 5755 + }, + { + "epoch": 1.5188847142574649, + "grad_norm": 0.021919893100857735, + "learning_rate": 4.813299907639531e-05, + "loss": 0.0081, + "step": 5760 + }, + { + "epoch": 1.5202030189176718, + "grad_norm": 0.5730367302894592, + "learning_rate": 4.800105554822536e-05, + "loss": 0.0254, + "step": 5765 + }, + { + "epoch": 1.5215213235778788, + "grad_norm": 0.02501523122191429, + "learning_rate": 4.786911202005542e-05, + "loss": 0.0045, + "step": 5770 + }, + { + "epoch": 1.5228396282380858, + "grad_norm": 0.01574208028614521, + "learning_rate": 4.773716849188548e-05, + "loss": 0.0081, + "step": 5775 + }, + { + "epoch": 1.5241579328982928, + "grad_norm": 0.009626791812479496, + "learning_rate": 4.760522496371553e-05, + "loss": 0.0037, + "step": 5780 + }, + { + "epoch": 1.5254762375584998, + "grad_norm": 0.535539448261261, + "learning_rate": 4.747328143554559e-05, + "loss": 0.0149, + "step": 5785 + }, + { + "epoch": 1.5267945422187066, + "grad_norm": 0.004934845492243767, + "learning_rate": 4.7341337907375644e-05, + "loss": 0.0048, + "step": 5790 + }, + { + "epoch": 1.5281128468789138, + "grad_norm": 0.009070080704987049, + "learning_rate": 4.72093943792057e-05, + "loss": 0.0028, + "step": 5795 + }, + { + "epoch": 1.5294311515391206, + "grad_norm": 0.0040720063261687756, + "learning_rate": 4.707745085103576e-05, + "loss": 0.0016, + "step": 5800 + }, + { + "epoch": 1.5307494561993278, + "grad_norm": 0.45212000608444214, + "learning_rate": 4.694550732286582e-05, + "loss": 0.0111, + "step": 5805 + }, + { + "epoch": 1.5320677608595346, + "grad_norm": 0.024048497900366783, + "learning_rate": 4.681356379469587e-05, + "loss": 0.0149, + "step": 5810 + }, + { + "epoch": 1.5333860655197418, + "grad_norm": 0.11899136006832123, + "learning_rate": 4.668162026652593e-05, + "loss": 0.0034, + "step": 5815 + }, + { + "epoch": 1.5347043701799485, + "grad_norm": 0.011249657720327377, + "learning_rate": 4.6549676738355984e-05, + "loss": 0.0052, + "step": 5820 + }, + { + "epoch": 1.5360226748401555, + "grad_norm": 0.051634710282087326, + "learning_rate": 4.641773321018604e-05, + "loss": 0.0031, + "step": 5825 + }, + { + "epoch": 1.5373409795003625, + "grad_norm": 0.3726826012134552, + "learning_rate": 4.62857896820161e-05, + "loss": 0.0582, + "step": 5830 + }, + { + "epoch": 1.5386592841605695, + "grad_norm": 0.5827310681343079, + "learning_rate": 4.615384615384616e-05, + "loss": 0.0652, + "step": 5835 + }, + { + "epoch": 1.5399775888207765, + "grad_norm": 0.006390869617462158, + "learning_rate": 4.6021902625676214e-05, + "loss": 0.0022, + "step": 5840 + }, + { + "epoch": 1.5412958934809835, + "grad_norm": 0.022760871797800064, + "learning_rate": 4.588995909750627e-05, + "loss": 0.0311, + "step": 5845 + }, + { + "epoch": 1.5426141981411905, + "grad_norm": 0.22773241996765137, + "learning_rate": 4.5758015569336325e-05, + "loss": 0.0051, + "step": 5850 + }, + { + "epoch": 1.5439325028013973, + "grad_norm": 0.015375247225165367, + "learning_rate": 4.562607204116639e-05, + "loss": 0.0023, + "step": 5855 + }, + { + "epoch": 1.5452508074616045, + "grad_norm": 0.007347101345658302, + "learning_rate": 4.549412851299644e-05, + "loss": 0.0437, + "step": 5860 + }, + { + "epoch": 1.5465691121218113, + "grad_norm": 
0.012344900518655777, + "learning_rate": 4.536218498482649e-05, + "loss": 0.004, + "step": 5865 + }, + { + "epoch": 1.5478874167820185, + "grad_norm": 0.27038896083831787, + "learning_rate": 4.5230241456656555e-05, + "loss": 0.0047, + "step": 5870 + }, + { + "epoch": 1.5492057214422252, + "grad_norm": 0.016395213082432747, + "learning_rate": 4.509829792848661e-05, + "loss": 0.0026, + "step": 5875 + }, + { + "epoch": 1.5505240261024322, + "grad_norm": 0.4217267632484436, + "learning_rate": 4.4966354400316666e-05, + "loss": 0.0364, + "step": 5880 + }, + { + "epoch": 1.5518423307626392, + "grad_norm": 0.20046105980873108, + "learning_rate": 4.483441087214673e-05, + "loss": 0.0243, + "step": 5885 + }, + { + "epoch": 1.5531606354228462, + "grad_norm": 0.004307698458433151, + "learning_rate": 4.470246734397678e-05, + "loss": 0.0064, + "step": 5890 + }, + { + "epoch": 1.5544789400830532, + "grad_norm": 0.46102187037467957, + "learning_rate": 4.457052381580683e-05, + "loss": 0.0115, + "step": 5895 + }, + { + "epoch": 1.5557972447432602, + "grad_norm": 0.0689118504524231, + "learning_rate": 4.4438580287636895e-05, + "loss": 0.0334, + "step": 5900 + }, + { + "epoch": 1.5571155494034672, + "grad_norm": 0.003091114340350032, + "learning_rate": 4.430663675946695e-05, + "loss": 0.0246, + "step": 5905 + }, + { + "epoch": 1.558433854063674, + "grad_norm": 0.003877349430695176, + "learning_rate": 4.417469323129701e-05, + "loss": 0.0032, + "step": 5910 + }, + { + "epoch": 1.5597521587238812, + "grad_norm": 0.30713143944740295, + "learning_rate": 4.404274970312706e-05, + "loss": 0.0229, + "step": 5915 + }, + { + "epoch": 1.561070463384088, + "grad_norm": 0.07344445586204529, + "learning_rate": 4.391080617495712e-05, + "loss": 0.0078, + "step": 5920 + }, + { + "epoch": 1.5623887680442952, + "grad_norm": 0.01774723082780838, + "learning_rate": 4.377886264678718e-05, + "loss": 0.0034, + "step": 5925 + }, + { + "epoch": 1.563707072704502, + "grad_norm": 0.476324200630188, + "learning_rate": 4.3646919118617236e-05, + "loss": 0.0071, + "step": 5930 + }, + { + "epoch": 1.5650253773647091, + "grad_norm": 0.11624465882778168, + "learning_rate": 4.351497559044729e-05, + "loss": 0.0236, + "step": 5935 + }, + { + "epoch": 1.566343682024916, + "grad_norm": 0.190691277384758, + "learning_rate": 4.338303206227735e-05, + "loss": 0.006, + "step": 5940 + }, + { + "epoch": 1.567661986685123, + "grad_norm": 0.20517045259475708, + "learning_rate": 4.32510885341074e-05, + "loss": 0.009, + "step": 5945 + }, + { + "epoch": 1.56898029134533, + "grad_norm": 0.008122317492961884, + "learning_rate": 4.311914500593746e-05, + "loss": 0.0041, + "step": 5950 + }, + { + "epoch": 1.570298596005537, + "grad_norm": 0.01982291042804718, + "learning_rate": 4.298720147776752e-05, + "loss": 0.0258, + "step": 5955 + }, + { + "epoch": 1.5716169006657439, + "grad_norm": 0.000996922142803669, + "learning_rate": 4.285525794959758e-05, + "loss": 0.0233, + "step": 5960 + }, + { + "epoch": 1.5729352053259509, + "grad_norm": 0.09725592285394669, + "learning_rate": 4.272331442142763e-05, + "loss": 0.0218, + "step": 5965 + }, + { + "epoch": 1.5742535099861579, + "grad_norm": 0.0672350749373436, + "learning_rate": 4.259137089325769e-05, + "loss": 0.0194, + "step": 5970 + }, + { + "epoch": 1.5755718146463646, + "grad_norm": 0.014844833873212337, + "learning_rate": 4.2459427365087744e-05, + "loss": 0.0298, + "step": 5975 + }, + { + "epoch": 1.5768901193065719, + "grad_norm": 0.030519040301442146, + "learning_rate": 4.2327483836917806e-05, + "loss": 0.0178, + 
"step": 5980 + }, + { + "epoch": 1.5782084239667786, + "grad_norm": 0.018561460077762604, + "learning_rate": 4.219554030874786e-05, + "loss": 0.0154, + "step": 5985 + }, + { + "epoch": 1.5795267286269858, + "grad_norm": 0.02470085583627224, + "learning_rate": 4.206359678057791e-05, + "loss": 0.0361, + "step": 5990 + }, + { + "epoch": 1.5808450332871926, + "grad_norm": 0.055412422865629196, + "learning_rate": 4.193165325240797e-05, + "loss": 0.0162, + "step": 5995 + }, + { + "epoch": 1.5821633379473996, + "grad_norm": 0.0034158769994974136, + "learning_rate": 4.179970972423803e-05, + "loss": 0.0068, + "step": 6000 + }, + { + "epoch": 1.5821633379473996, + "eval_loss": 0.024797894060611725, + "eval_runtime": 452.1611, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 6000 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.8426226717635994e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6000/training_args.bin b/checkpoint-6000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-6000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-6500/README.md b/checkpoint-6500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-6500/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-6500/adapter_config.json b/checkpoint-6500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-6500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-6500/adapter_model.safetensors b/checkpoint-6500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..419b5a3f663bfbb96602189cccab165f8f7d20a6 --- /dev/null +++ b/checkpoint-6500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:944c167a0f62480450153182ddac47ef83c870f9c6321990610e8c6ec350a916 +size 97307544 diff --git a/checkpoint-6500/optimizer.pt b/checkpoint-6500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c6a8e883b297b405fec98fe5b088f7901e571f2 --- 
/dev/null +++ b/checkpoint-6500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b1eed69a75dadc91c6b40c7758e8c08c059b5847a9e66ba42d1e32b079d2056 +size 50866370 diff --git a/checkpoint-6500/rng_state.pth b/checkpoint-6500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a7ab1362c9d28971308c91701c9c4ab050d1ec5d --- /dev/null +++ b/checkpoint-6500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2545847c503bf67daa3a4c88b39f31a788219a98b9e09ef90e4792c9b43f9e9 +size 14244 diff --git a/checkpoint-6500/scheduler.pt b/checkpoint-6500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5954d500bbccd9723ed11095960abb64933734d7 --- /dev/null +++ b/checkpoint-6500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbc99e05666985ad1073fbdfd466eac447bfd3fe2ec0280dbecde6e609546e3a +size 1064 diff --git a/checkpoint-6500/special_tokens_map.json b/checkpoint-6500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-6500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-6500/tokenizer.json b/checkpoint-6500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-6500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-6500/tokenizer_config.json b/checkpoint-6500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-6500/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": 
"<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": 
"<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": 
"<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": 
"<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": 
"<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": 
"<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": 
"<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": 
"<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-6500/trainer_state.json b/checkpoint-6500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5e3f6830ff331277ec2e2c6a35523957ddef3f89 --- /dev/null +++ b/checkpoint-6500/trainer_state.json @@ -0,0 +1,9240 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7139938039680969, + "eval_steps": 500, + "global_step": 6500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + 
"step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 
0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + "learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + 
"learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 
0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, + "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + 
"learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + "learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 
0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + "grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 
0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + "step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 
0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + 
"loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + "learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 
0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + "epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 
0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 
0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + "epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 
0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 
0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + "loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + 
}, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 
0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + "learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 
0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + 
"loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + 
"grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + "learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 
0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + "step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, 
+ { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.03460961952805519, + "learning_rate": 0.00013402823591502838, + "loss": 0.0097, + "step": 2505 + }, + { + "epoch": 0.6617889394239008, + "grad_norm": 0.31785696744918823, + "learning_rate": 0.00013389629238685842, + "loss": 0.0303, + "step": 2510 + }, + { + "epoch": 0.6631072440841078, + "grad_norm": 0.4273851215839386, + "learning_rate": 0.0001337643488586885, + "loss": 0.0499, + "step": 2515 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.02236153744161129, + "learning_rate": 0.00013363240533051856, + "loss": 0.0069, + "step": 2520 + }, + { + "epoch": 0.6657438534045218, + "grad_norm": 0.1592864990234375, + "learning_rate": 0.0001335004618023486, + "loss": 0.0326, + "step": 2525 + }, + { + "epoch": 0.6670621580647288, + 
"grad_norm": 0.029961545020341873, + "learning_rate": 0.00013336851827417867, + "loss": 0.0178, + "step": 2530 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.03120764158666134, + "learning_rate": 0.00013323657474600872, + "loss": 0.115, + "step": 2535 + }, + { + "epoch": 0.6696987673851427, + "grad_norm": 0.01060028001666069, + "learning_rate": 0.00013310463121783879, + "loss": 0.0036, + "step": 2540 + }, + { + "epoch": 0.6710170720453497, + "grad_norm": 0.053470809012651443, + "learning_rate": 0.00013297268768966883, + "loss": 0.0079, + "step": 2545 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.022777097299695015, + "learning_rate": 0.00013284074416149887, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6736536813657636, + "grad_norm": 0.0548521913588047, + "learning_rate": 0.00013270880063332894, + "loss": 0.0503, + "step": 2555 + }, + { + "epoch": 0.6749719860259706, + "grad_norm": 0.02028457075357437, + "learning_rate": 0.00013257685710515898, + "loss": 0.0096, + "step": 2560 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.01569107361137867, + "learning_rate": 0.00013244491357698905, + "loss": 0.008, + "step": 2565 + }, + { + "epoch": 0.6776085953463845, + "grad_norm": 0.00743742985650897, + "learning_rate": 0.00013231297004881912, + "loss": 0.005, + "step": 2570 + }, + { + "epoch": 0.6789269000065915, + "grad_norm": 0.025164416059851646, + "learning_rate": 0.00013218102652064916, + "loss": 0.018, + "step": 2575 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.3653188645839691, + "learning_rate": 0.00013204908299247923, + "loss": 0.0295, + "step": 2580 + }, + { + "epoch": 0.6815635093270055, + "grad_norm": 0.685422956943512, + "learning_rate": 0.00013191713946430927, + "loss": 0.0335, + "step": 2585 + }, + { + "epoch": 0.6828818139872125, + "grad_norm": 0.675740122795105, + "learning_rate": 0.00013178519593613934, + "loss": 0.0592, + "step": 2590 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.10513252764940262, + "learning_rate": 0.00013165325240796938, + "loss": 0.0353, + "step": 2595 + }, + { + "epoch": 0.6855184233076264, + "grad_norm": 0.43512973189353943, + "learning_rate": 0.00013152130887979945, + "loss": 0.0142, + "step": 2600 + }, + { + "epoch": 0.6868367279678333, + "grad_norm": 0.029436839744448662, + "learning_rate": 0.00013138936535162952, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.5607122778892517, + "learning_rate": 0.00013125742182345957, + "loss": 0.0184, + "step": 2610 + }, + { + "epoch": 0.6894733372882473, + "grad_norm": 0.11365406215190887, + "learning_rate": 0.00013112547829528963, + "loss": 0.006, + "step": 2615 + }, + { + "epoch": 0.6907916419484543, + "grad_norm": 0.047227244824171066, + "learning_rate": 0.00013099353476711968, + "loss": 0.008, + "step": 2620 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.0005877618095837533, + "learning_rate": 0.00013086159123894975, + "loss": 0.0286, + "step": 2625 + }, + { + "epoch": 0.6934282512688682, + "grad_norm": 0.010759112425148487, + "learning_rate": 0.0001307296477107798, + "loss": 0.0062, + "step": 2630 + }, + { + "epoch": 0.6947465559290752, + "grad_norm": 0.07117745280265808, + "learning_rate": 0.00013059770418260986, + "loss": 0.0891, + "step": 2635 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.0639057606458664, + "learning_rate": 0.00013046576065443993, + "loss": 0.0072, + "step": 2640 + }, + { + "epoch": 0.6973831652494892, + "grad_norm": 0.027350090444087982, + "learning_rate": 
0.00013033381712626994, + "loss": 0.0103, + "step": 2645 + }, + { + "epoch": 0.6987014699096962, + "grad_norm": 0.015336195938289165, + "learning_rate": 0.0001302018735981, + "loss": 0.0041, + "step": 2650 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 1.0650830268859863, + "learning_rate": 0.00013006993006993008, + "loss": 0.0443, + "step": 2655 + }, + { + "epoch": 0.70133807923011, + "grad_norm": 0.019073212519288063, + "learning_rate": 0.00012993798654176012, + "loss": 0.0331, + "step": 2660 + }, + { + "epoch": 0.702656383890317, + "grad_norm": 0.10109209269285202, + "learning_rate": 0.0001298060430135902, + "loss": 0.0054, + "step": 2665 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.03528957813978195, + "learning_rate": 0.00012967409948542023, + "loss": 0.0427, + "step": 2670 + }, + { + "epoch": 0.705292993210731, + "grad_norm": 0.03577788919210434, + "learning_rate": 0.0001295421559572503, + "loss": 0.023, + "step": 2675 + }, + { + "epoch": 0.706611297870938, + "grad_norm": 0.5576180815696716, + "learning_rate": 0.00012941021242908034, + "loss": 0.0416, + "step": 2680 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.017131298780441284, + "learning_rate": 0.0001292782689009104, + "loss": 0.0235, + "step": 2685 + }, + { + "epoch": 0.7092479071913519, + "grad_norm": 0.8517888784408569, + "learning_rate": 0.00012914632537274048, + "loss": 0.0168, + "step": 2690 + }, + { + "epoch": 0.7105662118515589, + "grad_norm": 0.23812156915664673, + "learning_rate": 0.00012901438184457052, + "loss": 0.0483, + "step": 2695 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.11746613681316376, + "learning_rate": 0.0001288824383164006, + "loss": 0.0255, + "step": 2700 + }, + { + "epoch": 0.7132028211719729, + "grad_norm": 0.20089928805828094, + "learning_rate": 0.00012875049478823064, + "loss": 0.0267, + "step": 2705 + }, + { + "epoch": 0.7145211258321799, + "grad_norm": 0.8301129937171936, + "learning_rate": 0.0001286185512600607, + "loss": 0.016, + "step": 2710 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.01838674768805504, + "learning_rate": 0.00012848660773189077, + "loss": 0.0229, + "step": 2715 + }, + { + "epoch": 0.7171577351525937, + "grad_norm": 0.03670337051153183, + "learning_rate": 0.00012835466420372082, + "loss": 0.038, + "step": 2720 + }, + { + "epoch": 0.7184760398128007, + "grad_norm": 0.0452633760869503, + "learning_rate": 0.00012822272067555089, + "loss": 0.0622, + "step": 2725 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.09503110498189926, + "learning_rate": 0.00012809077714738093, + "loss": 0.0209, + "step": 2730 + }, + { + "epoch": 0.7211126491332147, + "grad_norm": 1.0327308177947998, + "learning_rate": 0.000127958833619211, + "loss": 0.0361, + "step": 2735 + }, + { + "epoch": 0.7224309537934217, + "grad_norm": 1.0049290657043457, + "learning_rate": 0.00012782689009104104, + "loss": 0.0365, + "step": 2740 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.029774073511362076, + "learning_rate": 0.00012769494656287108, + "loss": 0.0257, + "step": 2745 + }, + { + "epoch": 0.7250675631138356, + "grad_norm": 0.20974040031433105, + "learning_rate": 0.00012756300303470115, + "loss": 0.0542, + "step": 2750 + }, + { + "epoch": 0.7263858677740426, + "grad_norm": 0.8153854608535767, + "learning_rate": 0.0001274310595065312, + "loss": 0.0216, + "step": 2755 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.4393698573112488, + "learning_rate": 0.00012729911597836126, + "loss": 0.0451, + "step": 2760 + }, + { + "epoch": 
0.7290224770944566, + "grad_norm": 0.06990349292755127, + "learning_rate": 0.00012716717245019133, + "loss": 0.03, + "step": 2765 + }, + { + "epoch": 0.7303407817546635, + "grad_norm": 0.32689470052719116, + "learning_rate": 0.00012703522892202137, + "loss": 0.0263, + "step": 2770 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.026600876823067665, + "learning_rate": 0.00012690328539385144, + "loss": 0.0404, + "step": 2775 + }, + { + "epoch": 0.7329773910750774, + "grad_norm": 0.11228257417678833, + "learning_rate": 0.00012677134186568148, + "loss": 0.0224, + "step": 2780 + }, + { + "epoch": 0.7342956957352844, + "grad_norm": 0.6469443440437317, + "learning_rate": 0.00012663939833751155, + "loss": 0.0178, + "step": 2785 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.020773250609636307, + "learning_rate": 0.0001265074548093416, + "loss": 0.011, + "step": 2790 + }, + { + "epoch": 0.7369323050556984, + "grad_norm": 0.7378728985786438, + "learning_rate": 0.00012637551128117167, + "loss": 0.0227, + "step": 2795 + }, + { + "epoch": 0.7382506097159054, + "grad_norm": 0.008189595304429531, + "learning_rate": 0.00012624356775300173, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.031633853912353516, + "learning_rate": 0.00012611162422483178, + "loss": 0.0093, + "step": 2805 + }, + { + "epoch": 0.7408872190363193, + "grad_norm": 0.5078475475311279, + "learning_rate": 0.00012597968069666185, + "loss": 0.0567, + "step": 2810 + }, + { + "epoch": 0.7422055236965263, + "grad_norm": 0.21766887605190277, + "learning_rate": 0.0001258477371684919, + "loss": 0.0485, + "step": 2815 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.3029612898826599, + "learning_rate": 0.00012571579364032196, + "loss": 0.032, + "step": 2820 + }, + { + "epoch": 0.7448421330169402, + "grad_norm": 1.2135159969329834, + "learning_rate": 0.00012558385011215203, + "loss": 0.0139, + "step": 2825 + }, + { + "epoch": 0.7461604376771472, + "grad_norm": 0.016875172033905983, + "learning_rate": 0.00012545190658398207, + "loss": 0.0323, + "step": 2830 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.08923230320215225, + "learning_rate": 0.00012531996305581214, + "loss": 0.0343, + "step": 2835 + }, + { + "epoch": 0.7487970469975611, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.00012518801952764215, + "loss": 0.0431, + "step": 2840 + }, + { + "epoch": 0.7501153516577681, + "grad_norm": 0.7344386577606201, + "learning_rate": 0.00012505607599947222, + "loss": 0.0389, + "step": 2845 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.03681635856628418, + "learning_rate": 0.0001249241324713023, + "loss": 0.0258, + "step": 2850 + }, + { + "epoch": 0.7527519609781821, + "grad_norm": 0.22866861522197723, + "learning_rate": 0.00012479218894313233, + "loss": 0.0223, + "step": 2855 + }, + { + "epoch": 0.7540702656383891, + "grad_norm": 0.029770435765385628, + "learning_rate": 0.0001246602454149624, + "loss": 0.0205, + "step": 2860 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.011845707893371582, + "learning_rate": 0.00012452830188679244, + "loss": 0.0252, + "step": 2865 + }, + { + "epoch": 0.756706874958803, + "grad_norm": 0.06696149706840515, + "learning_rate": 0.00012439635835862251, + "loss": 0.0166, + "step": 2870 + }, + { + "epoch": 0.75802517961901, + "grad_norm": 0.01653144136071205, + "learning_rate": 0.00012426441483045256, + "loss": 0.0487, + "step": 2875 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.031312476843595505, + 
"learning_rate": 0.00012413247130228263, + "loss": 0.0155, + "step": 2880 + }, + { + "epoch": 0.7606617889394239, + "grad_norm": 0.011625733226537704, + "learning_rate": 0.0001240005277741127, + "loss": 0.0333, + "step": 2885 + }, + { + "epoch": 0.7619800935996309, + "grad_norm": 0.012089414522051811, + "learning_rate": 0.00012386858424594274, + "loss": 0.003, + "step": 2890 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.3012307584285736, + "learning_rate": 0.0001237366407177728, + "loss": 0.0172, + "step": 2895 + }, + { + "epoch": 0.7646167029200448, + "grad_norm": 0.31575000286102295, + "learning_rate": 0.00012360469718960285, + "loss": 0.0409, + "step": 2900 + }, + { + "epoch": 0.7659350075802518, + "grad_norm": 0.009794364683330059, + "learning_rate": 0.00012347275366143292, + "loss": 0.0214, + "step": 2905 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5973085165023804, + "learning_rate": 0.00012334081013326299, + "loss": 0.0245, + "step": 2910 + }, + { + "epoch": 0.7685716169006658, + "grad_norm": 0.019750040024518967, + "learning_rate": 0.00012320886660509303, + "loss": 0.0063, + "step": 2915 + }, + { + "epoch": 0.7698899215608728, + "grad_norm": 0.06402858346700668, + "learning_rate": 0.0001230769230769231, + "loss": 0.0444, + "step": 2920 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.02876671403646469, + "learning_rate": 0.00012294497954875314, + "loss": 0.0103, + "step": 2925 + }, + { + "epoch": 0.7725265308812866, + "grad_norm": 0.6962207555770874, + "learning_rate": 0.0001228130360205832, + "loss": 0.0318, + "step": 2930 + }, + { + "epoch": 0.7738448355414936, + "grad_norm": 0.006536522414535284, + "learning_rate": 0.00012268109249241325, + "loss": 0.0096, + "step": 2935 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.07097168266773224, + "learning_rate": 0.0001225491489642433, + "loss": 0.0174, + "step": 2940 + }, + { + "epoch": 0.7764814448619076, + "grad_norm": 0.042360126972198486, + "learning_rate": 0.00012241720543607336, + "loss": 0.0158, + "step": 2945 + }, + { + "epoch": 0.7777997495221146, + "grad_norm": 0.01159572321921587, + "learning_rate": 0.0001222852619079034, + "loss": 0.0265, + "step": 2950 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.38408163189888, + "learning_rate": 0.00012215331837973347, + "loss": 0.0233, + "step": 2955 + }, + { + "epoch": 0.7804363588425285, + "grad_norm": 0.15588605403900146, + "learning_rate": 0.00012202137485156353, + "loss": 0.0041, + "step": 2960 + }, + { + "epoch": 0.7817546635027355, + "grad_norm": 0.006892362609505653, + "learning_rate": 0.00012188943132339358, + "loss": 0.0026, + "step": 2965 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.030915727838873863, + "learning_rate": 0.00012175748779522364, + "loss": 0.0028, + "step": 2970 + }, + { + "epoch": 0.7843912728231495, + "grad_norm": 0.8151025772094727, + "learning_rate": 0.00012162554426705371, + "loss": 0.0429, + "step": 2975 + }, + { + "epoch": 0.7857095774833565, + "grad_norm": 0.6765475273132324, + "learning_rate": 0.00012149360073888377, + "loss": 0.0319, + "step": 2980 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.054469238966703415, + "learning_rate": 0.00012136165721071382, + "loss": 0.0413, + "step": 2985 + }, + { + "epoch": 0.7883461868037703, + "grad_norm": 0.045610666275024414, + "learning_rate": 0.00012122971368254388, + "loss": 0.0521, + "step": 2990 + }, + { + "epoch": 0.7896644914639773, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.00012109777015437393, + "loss": 0.0846, + 
"step": 2995 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.0272397268563509, + "learning_rate": 0.00012096582662620399, + "loss": 0.0364, + "step": 3000 + }, + { + "epoch": 0.7909827961241843, + "eval_loss": 0.033312585204839706, + "eval_runtime": 452.2552, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 3000 + }, + { + "epoch": 0.7923011007843913, + "grad_norm": 0.08674059063196182, + "learning_rate": 0.00012083388309803406, + "loss": 0.0081, + "step": 3005 + }, + { + "epoch": 0.7936194054445982, + "grad_norm": 0.21960832178592682, + "learning_rate": 0.00012070193956986411, + "loss": 0.0468, + "step": 3010 + }, + { + "epoch": 0.7949377101048052, + "grad_norm": 0.11259289085865021, + "learning_rate": 0.00012056999604169417, + "loss": 0.0124, + "step": 3015 + }, + { + "epoch": 0.7962560147650122, + "grad_norm": 0.02945362776517868, + "learning_rate": 0.00012043805251352422, + "loss": 0.0298, + "step": 3020 + }, + { + "epoch": 0.7975743194252192, + "grad_norm": 0.27889615297317505, + "learning_rate": 0.00012030610898535428, + "loss": 0.0251, + "step": 3025 + }, + { + "epoch": 0.7988926240854262, + "grad_norm": 0.05873241275548935, + "learning_rate": 0.00012017416545718434, + "loss": 0.0132, + "step": 3030 + }, + { + "epoch": 0.8002109287456332, + "grad_norm": 0.1570046991109848, + "learning_rate": 0.00012004222192901439, + "loss": 0.0228, + "step": 3035 + }, + { + "epoch": 0.80152923340584, + "grad_norm": 0.12575332820415497, + "learning_rate": 0.00011991027840084443, + "loss": 0.0049, + "step": 3040 + }, + { + "epoch": 0.802847538066047, + "grad_norm": 0.8416435122489929, + "learning_rate": 0.00011977833487267449, + "loss": 0.0542, + "step": 3045 + }, + { + "epoch": 0.804165842726254, + "grad_norm": 0.2605098485946655, + "learning_rate": 0.00011964639134450454, + "loss": 0.0084, + "step": 3050 + }, + { + "epoch": 0.805484147386461, + "grad_norm": 0.8996294736862183, + "learning_rate": 0.00011951444781633461, + "loss": 0.0442, + "step": 3055 + }, + { + "epoch": 0.806802452046668, + "grad_norm": 2.7525105476379395, + "learning_rate": 0.00011938250428816467, + "loss": 0.0642, + "step": 3060 + }, + { + "epoch": 0.808120756706875, + "grad_norm": 0.14955930411815643, + "learning_rate": 0.00011925056075999473, + "loss": 0.0384, + "step": 3065 + }, + { + "epoch": 0.8094390613670819, + "grad_norm": 0.018756115809082985, + "learning_rate": 0.00011911861723182478, + "loss": 0.0154, + "step": 3070 + }, + { + "epoch": 0.8107573660272889, + "grad_norm": 0.23998615145683289, + "learning_rate": 0.00011898667370365484, + "loss": 0.0413, + "step": 3075 + }, + { + "epoch": 0.8120756706874959, + "grad_norm": 0.27253249287605286, + "learning_rate": 0.00011885473017548489, + "loss": 0.0081, + "step": 3080 + }, + { + "epoch": 0.8133939753477029, + "grad_norm": 0.2925993502140045, + "learning_rate": 0.00011872278664731495, + "loss": 0.0332, + "step": 3085 + }, + { + "epoch": 0.8147122800079099, + "grad_norm": 0.5364832878112793, + "learning_rate": 0.00011859084311914502, + "loss": 0.0143, + "step": 3090 + }, + { + "epoch": 0.8160305846681168, + "grad_norm": 0.32104921340942383, + "learning_rate": 0.00011845889959097507, + "loss": 0.0216, + "step": 3095 + }, + { + "epoch": 0.8173488893283237, + "grad_norm": 0.0205856766551733, + "learning_rate": 0.00011832695606280513, + "loss": 0.0346, + "step": 3100 + }, + { + "epoch": 0.8186671939885307, + "grad_norm": 0.2541547417640686, + "learning_rate": 0.00011819501253463518, + "loss": 0.0793, + "step": 3105 + }, + { + "epoch": 
0.8199854986487377, + "grad_norm": 0.08333491533994675, + "learning_rate": 0.00011806306900646524, + "loss": 0.0049, + "step": 3110 + }, + { + "epoch": 0.8213038033089447, + "grad_norm": 0.0355968177318573, + "learning_rate": 0.0001179311254782953, + "loss": 0.0051, + "step": 3115 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.06948401033878326, + "learning_rate": 0.00011779918195012536, + "loss": 0.013, + "step": 3120 + }, + { + "epoch": 0.8239404126293587, + "grad_norm": 0.03328891843557358, + "learning_rate": 0.00011766723842195542, + "loss": 0.0122, + "step": 3125 + }, + { + "epoch": 0.8252587172895656, + "grad_norm": 0.013782350346446037, + "learning_rate": 0.00011753529489378548, + "loss": 0.0073, + "step": 3130 + }, + { + "epoch": 0.8265770219497726, + "grad_norm": 0.024390392005443573, + "learning_rate": 0.00011740335136561553, + "loss": 0.0143, + "step": 3135 + }, + { + "epoch": 0.8278953266099796, + "grad_norm": 0.002548128366470337, + "learning_rate": 0.00011727140783744557, + "loss": 0.0027, + "step": 3140 + }, + { + "epoch": 0.8292136312701865, + "grad_norm": 0.11674848943948746, + "learning_rate": 0.00011713946430927563, + "loss": 0.0253, + "step": 3145 + }, + { + "epoch": 0.8305319359303935, + "grad_norm": 0.005774884019047022, + "learning_rate": 0.00011700752078110568, + "loss": 0.0018, + "step": 3150 + }, + { + "epoch": 0.8318502405906005, + "grad_norm": 0.5763069987297058, + "learning_rate": 0.00011687557725293574, + "loss": 0.0119, + "step": 3155 + }, + { + "epoch": 0.8331685452508074, + "grad_norm": 0.0027607593219727278, + "learning_rate": 0.0001167436337247658, + "loss": 0.0279, + "step": 3160 + }, + { + "epoch": 0.8344868499110144, + "grad_norm": 1.859642505645752, + "learning_rate": 0.00011661169019659585, + "loss": 0.0228, + "step": 3165 + }, + { + "epoch": 0.8358051545712214, + "grad_norm": 0.16597022116184235, + "learning_rate": 0.00011647974666842592, + "loss": 0.1228, + "step": 3170 + }, + { + "epoch": 0.8371234592314284, + "grad_norm": 0.33833742141723633, + "learning_rate": 0.00011634780314025598, + "loss": 0.073, + "step": 3175 + }, + { + "epoch": 0.8384417638916354, + "grad_norm": 0.024682912975549698, + "learning_rate": 0.00011621585961208603, + "loss": 0.0042, + "step": 3180 + }, + { + "epoch": 0.8397600685518424, + "grad_norm": 0.05926942452788353, + "learning_rate": 0.00011608391608391609, + "loss": 0.0066, + "step": 3185 + }, + { + "epoch": 0.8410783732120493, + "grad_norm": 0.1414029747247696, + "learning_rate": 0.00011595197255574614, + "loss": 0.0603, + "step": 3190 + }, + { + "epoch": 0.8423966778722562, + "grad_norm": 0.37928736209869385, + "learning_rate": 0.0001158200290275762, + "loss": 0.0266, + "step": 3195 + }, + { + "epoch": 0.8437149825324632, + "grad_norm": 0.018329354003071785, + "learning_rate": 0.00011568808549940627, + "loss": 0.0047, + "step": 3200 + }, + { + "epoch": 0.8450332871926702, + "grad_norm": 0.2993735373020172, + "learning_rate": 0.00011555614197123632, + "loss": 0.0218, + "step": 3205 + }, + { + "epoch": 0.8463515918528772, + "grad_norm": 0.1767728328704834, + "learning_rate": 0.00011542419844306638, + "loss": 0.0363, + "step": 3210 + }, + { + "epoch": 0.8476698965130842, + "grad_norm": 0.39774414896965027, + "learning_rate": 0.00011529225491489644, + "loss": 0.0506, + "step": 3215 + }, + { + "epoch": 0.8489882011732911, + "grad_norm": 0.021896762773394585, + "learning_rate": 0.00011516031138672649, + "loss": 0.0081, + "step": 3220 + }, + { + "epoch": 0.8503065058334981, + "grad_norm": 0.358372300863266, + 
"learning_rate": 0.00011502836785855655, + "loss": 0.0224, + "step": 3225 + }, + { + "epoch": 0.8516248104937051, + "grad_norm": 0.01605542004108429, + "learning_rate": 0.00011489642433038662, + "loss": 0.0215, + "step": 3230 + }, + { + "epoch": 0.8529431151539121, + "grad_norm": 0.021189266815781593, + "learning_rate": 0.00011476448080221667, + "loss": 0.0051, + "step": 3235 + }, + { + "epoch": 0.8542614198141191, + "grad_norm": 0.013394076377153397, + "learning_rate": 0.0001146325372740467, + "loss": 0.021, + "step": 3240 + }, + { + "epoch": 0.8555797244743261, + "grad_norm": 0.19848507642745972, + "learning_rate": 0.00011450059374587676, + "loss": 0.0285, + "step": 3245 + }, + { + "epoch": 0.856898029134533, + "grad_norm": 0.2463046759366989, + "learning_rate": 0.00011436865021770683, + "loss": 0.0384, + "step": 3250 + }, + { + "epoch": 0.8582163337947399, + "grad_norm": 0.37432390451431274, + "learning_rate": 0.00011423670668953688, + "loss": 0.0098, + "step": 3255 + }, + { + "epoch": 0.8595346384549469, + "grad_norm": 0.060943394899368286, + "learning_rate": 0.00011410476316136694, + "loss": 0.0087, + "step": 3260 + }, + { + "epoch": 0.8608529431151539, + "grad_norm": 0.2846696674823761, + "learning_rate": 0.00011397281963319699, + "loss": 0.0148, + "step": 3265 + }, + { + "epoch": 0.8621712477753609, + "grad_norm": 0.009311323054134846, + "learning_rate": 0.00011384087610502705, + "loss": 0.0024, + "step": 3270 + }, + { + "epoch": 0.8634895524355679, + "grad_norm": 0.046277035027742386, + "learning_rate": 0.0001137089325768571, + "loss": 0.0274, + "step": 3275 + }, + { + "epoch": 0.8648078570957748, + "grad_norm": 0.006024620030075312, + "learning_rate": 0.00011357698904868716, + "loss": 0.0286, + "step": 3280 + }, + { + "epoch": 0.8661261617559818, + "grad_norm": 0.033578380942344666, + "learning_rate": 0.00011344504552051723, + "loss": 0.0153, + "step": 3285 + }, + { + "epoch": 0.8674444664161888, + "grad_norm": 0.8537917137145996, + "learning_rate": 0.00011331310199234728, + "loss": 0.0304, + "step": 3290 + }, + { + "epoch": 0.8687627710763958, + "grad_norm": 0.013933337293565273, + "learning_rate": 0.00011318115846417734, + "loss": 0.0112, + "step": 3295 + }, + { + "epoch": 0.8700810757366028, + "grad_norm": 0.35437721014022827, + "learning_rate": 0.0001130492149360074, + "loss": 0.0228, + "step": 3300 + }, + { + "epoch": 0.8713993803968098, + "grad_norm": 1.3024121522903442, + "learning_rate": 0.00011291727140783745, + "loss": 0.0203, + "step": 3305 + }, + { + "epoch": 0.8727176850570166, + "grad_norm": 0.5131255984306335, + "learning_rate": 0.00011278532787966751, + "loss": 0.0181, + "step": 3310 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.039366886019706726, + "learning_rate": 0.00011265338435149758, + "loss": 0.0192, + "step": 3315 + }, + { + "epoch": 0.8753542943774306, + "grad_norm": 0.13679669797420502, + "learning_rate": 0.00011252144082332763, + "loss": 0.004, + "step": 3320 + }, + { + "epoch": 0.8766725990376376, + "grad_norm": 0.003076886525377631, + "learning_rate": 0.00011238949729515769, + "loss": 0.0405, + "step": 3325 + }, + { + "epoch": 0.8779909036978446, + "grad_norm": 0.019953785464167595, + "learning_rate": 0.00011225755376698774, + "loss": 0.0241, + "step": 3330 + }, + { + "epoch": 0.8793092083580516, + "grad_norm": 0.007980377413332462, + "learning_rate": 0.0001121256102388178, + "loss": 0.0064, + "step": 3335 + }, + { + "epoch": 0.8806275130182585, + "grad_norm": 0.018761295825242996, + "learning_rate": 0.00011199366671064784, + "loss": 
0.0032, + "step": 3340 + }, + { + "epoch": 0.8819458176784655, + "grad_norm": 0.022511709481477737, + "learning_rate": 0.0001118617231824779, + "loss": 0.0055, + "step": 3345 + }, + { + "epoch": 0.8832641223386725, + "grad_norm": 0.021270718425512314, + "learning_rate": 0.00011172977965430795, + "loss": 0.033, + "step": 3350 + }, + { + "epoch": 0.8845824269988795, + "grad_norm": 0.02710561640560627, + "learning_rate": 0.00011159783612613801, + "loss": 0.0094, + "step": 3355 + }, + { + "epoch": 0.8859007316590864, + "grad_norm": 0.4353378117084503, + "learning_rate": 0.00011146589259796806, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.8872190363192934, + "grad_norm": 0.0257766991853714, + "learning_rate": 0.00011133394906979813, + "loss": 0.0059, + "step": 3365 + }, + { + "epoch": 0.8885373409795003, + "grad_norm": 0.80838942527771, + "learning_rate": 0.00011120200554162819, + "loss": 0.0263, + "step": 3370 + }, + { + "epoch": 0.8898556456397073, + "grad_norm": 0.007799761835485697, + "learning_rate": 0.00011107006201345824, + "loss": 0.0028, + "step": 3375 + }, + { + "epoch": 0.8911739502999143, + "grad_norm": 0.007315775845199823, + "learning_rate": 0.0001109381184852883, + "loss": 0.0127, + "step": 3380 + }, + { + "epoch": 0.8924922549601213, + "grad_norm": 1.4861233234405518, + "learning_rate": 0.00011080617495711836, + "loss": 0.0562, + "step": 3385 + }, + { + "epoch": 0.8938105596203283, + "grad_norm": 0.010219530202448368, + "learning_rate": 0.00011067423142894841, + "loss": 0.0438, + "step": 3390 + }, + { + "epoch": 0.8951288642805353, + "grad_norm": 1.0191857814788818, + "learning_rate": 0.00011054228790077848, + "loss": 0.0493, + "step": 3395 + }, + { + "epoch": 0.8964471689407422, + "grad_norm": 0.01459536887705326, + "learning_rate": 0.00011041034437260854, + "loss": 0.0117, + "step": 3400 + }, + { + "epoch": 0.8977654736009492, + "grad_norm": 0.008682495914399624, + "learning_rate": 0.00011027840084443859, + "loss": 0.02, + "step": 3405 + }, + { + "epoch": 0.8990837782611562, + "grad_norm": 0.02197263017296791, + "learning_rate": 0.00011014645731626865, + "loss": 0.0454, + "step": 3410 + }, + { + "epoch": 0.9004020829213631, + "grad_norm": 0.01436714269220829, + "learning_rate": 0.0001100145137880987, + "loss": 0.0283, + "step": 3415 + }, + { + "epoch": 0.9017203875815701, + "grad_norm": 0.14327946305274963, + "learning_rate": 0.00010988257025992876, + "loss": 0.0461, + "step": 3420 + }, + { + "epoch": 0.9030386922417771, + "grad_norm": 1.671773910522461, + "learning_rate": 0.00010975062673175883, + "loss": 0.054, + "step": 3425 + }, + { + "epoch": 0.904356996901984, + "grad_norm": 0.009926804341375828, + "learning_rate": 0.00010961868320358888, + "loss": 0.0429, + "step": 3430 + }, + { + "epoch": 0.905675301562191, + "grad_norm": 0.554020881652832, + "learning_rate": 0.00010948673967541894, + "loss": 0.0618, + "step": 3435 + }, + { + "epoch": 0.906993606222398, + "grad_norm": 0.1399248093366623, + "learning_rate": 0.00010935479614724897, + "loss": 0.0229, + "step": 3440 + }, + { + "epoch": 0.908311910882605, + "grad_norm": 0.02739197015762329, + "learning_rate": 0.00010922285261907904, + "loss": 0.0082, + "step": 3445 + }, + { + "epoch": 0.909630215542812, + "grad_norm": 0.33394527435302734, + "learning_rate": 0.00010909090909090909, + "loss": 0.0403, + "step": 3450 + }, + { + "epoch": 0.9109485202030189, + "grad_norm": 0.08083894103765488, + "learning_rate": 0.00010895896556273915, + "loss": 0.0406, + "step": 3455 + }, + { + "epoch": 0.9122668248632259, + 
"grad_norm": 0.39336663484573364, + "learning_rate": 0.0001088270220345692, + "loss": 0.02, + "step": 3460 + }, + { + "epoch": 0.9135851295234328, + "grad_norm": 0.20481553673744202, + "learning_rate": 0.00010869507850639926, + "loss": 0.0221, + "step": 3465 + }, + { + "epoch": 0.9149034341836398, + "grad_norm": 1.4507408142089844, + "learning_rate": 0.00010856313497822932, + "loss": 0.0357, + "step": 3470 + }, + { + "epoch": 0.9162217388438468, + "grad_norm": 0.2678806483745575, + "learning_rate": 0.00010843119145005937, + "loss": 0.0181, + "step": 3475 + }, + { + "epoch": 0.9175400435040538, + "grad_norm": 0.007361674215644598, + "learning_rate": 0.00010829924792188944, + "loss": 0.0978, + "step": 3480 + }, + { + "epoch": 0.9188583481642607, + "grad_norm": 0.773695707321167, + "learning_rate": 0.0001081673043937195, + "loss": 0.0401, + "step": 3485 + }, + { + "epoch": 0.9201766528244677, + "grad_norm": 0.0010772625682875514, + "learning_rate": 0.00010803536086554955, + "loss": 0.0233, + "step": 3490 + }, + { + "epoch": 0.9214949574846747, + "grad_norm": 0.08971104770898819, + "learning_rate": 0.00010790341733737961, + "loss": 0.0319, + "step": 3495 + }, + { + "epoch": 0.9228132621448817, + "grad_norm": 0.21372731029987335, + "learning_rate": 0.00010777147380920966, + "loss": 0.0315, + "step": 3500 + }, + { + "epoch": 0.9228132621448817, + "eval_loss": 0.02952708676457405, + "eval_runtime": 451.5837, + "eval_samples_per_second": 7.467, + "eval_steps_per_second": 3.734, + "step": 3500 + }, + { + "epoch": 0.9241315668050887, + "grad_norm": 0.016639264300465584, + "learning_rate": 0.00010763953028103972, + "loss": 0.0125, + "step": 3505 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.46340492367744446, + "learning_rate": 0.00010750758675286979, + "loss": 0.0186, + "step": 3510 + }, + { + "epoch": 0.9267681761255026, + "grad_norm": 0.01847526989877224, + "learning_rate": 0.00010737564322469984, + "loss": 0.0026, + "step": 3515 + }, + { + "epoch": 0.9280864807857095, + "grad_norm": 0.5947860479354858, + "learning_rate": 0.0001072436996965299, + "loss": 0.0259, + "step": 3520 + }, + { + "epoch": 0.9294047854459165, + "grad_norm": 0.06145291402935982, + "learning_rate": 0.00010711175616835995, + "loss": 0.0057, + "step": 3525 + }, + { + "epoch": 0.9307230901061235, + "grad_norm": 0.0143959429115057, + "learning_rate": 0.00010697981264019001, + "loss": 0.0145, + "step": 3530 + }, + { + "epoch": 0.9320413947663305, + "grad_norm": 0.21143831312656403, + "learning_rate": 0.00010684786911202007, + "loss": 0.0459, + "step": 3535 + }, + { + "epoch": 0.9333596994265375, + "grad_norm": 0.02548077143728733, + "learning_rate": 0.00010671592558385011, + "loss": 0.0051, + "step": 3540 + }, + { + "epoch": 0.9346780040867444, + "grad_norm": 0.008077048696577549, + "learning_rate": 0.00010658398205568016, + "loss": 0.0306, + "step": 3545 + }, + { + "epoch": 0.9359963087469514, + "grad_norm": 0.0030760422814637423, + "learning_rate": 0.00010645203852751022, + "loss": 0.0575, + "step": 3550 + }, + { + "epoch": 0.9373146134071584, + "grad_norm": 0.18114158511161804, + "learning_rate": 0.00010632009499934027, + "loss": 0.0885, + "step": 3555 + }, + { + "epoch": 0.9386329180673654, + "grad_norm": 0.02450549602508545, + "learning_rate": 0.00010618815147117034, + "loss": 0.0045, + "step": 3560 + }, + { + "epoch": 0.9399512227275724, + "grad_norm": 0.1238626018166542, + "learning_rate": 0.0001060562079430004, + "loss": 0.0166, + "step": 3565 + }, + { + "epoch": 0.9412695273877794, + "grad_norm": 
0.1879919469356537, + "learning_rate": 0.00010592426441483046, + "loss": 0.0077, + "step": 3570 + }, + { + "epoch": 0.9425878320479862, + "grad_norm": 0.11323565989732742, + "learning_rate": 0.00010579232088666051, + "loss": 0.0213, + "step": 3575 + }, + { + "epoch": 0.9439061367081932, + "grad_norm": 0.35575854778289795, + "learning_rate": 0.00010566037735849057, + "loss": 0.0336, + "step": 3580 + }, + { + "epoch": 0.9452244413684002, + "grad_norm": 0.14052227139472961, + "learning_rate": 0.00010552843383032062, + "loss": 0.0325, + "step": 3585 + }, + { + "epoch": 0.9465427460286072, + "grad_norm": 0.2643798887729645, + "learning_rate": 0.00010539649030215069, + "loss": 0.0192, + "step": 3590 + }, + { + "epoch": 0.9478610506888142, + "grad_norm": 0.3207031190395355, + "learning_rate": 0.00010526454677398075, + "loss": 0.0221, + "step": 3595 + }, + { + "epoch": 0.9491793553490212, + "grad_norm": 0.022803861647844315, + "learning_rate": 0.0001051326032458108, + "loss": 0.029, + "step": 3600 + }, + { + "epoch": 0.9504976600092281, + "grad_norm": 0.02511664852499962, + "learning_rate": 0.00010500065971764086, + "loss": 0.0422, + "step": 3605 + }, + { + "epoch": 0.9518159646694351, + "grad_norm": 0.06505445390939713, + "learning_rate": 0.00010486871618947091, + "loss": 0.0092, + "step": 3610 + }, + { + "epoch": 0.9531342693296421, + "grad_norm": 0.09998584538698196, + "learning_rate": 0.00010473677266130097, + "loss": 0.0242, + "step": 3615 + }, + { + "epoch": 0.9544525739898491, + "grad_norm": 0.9645698666572571, + "learning_rate": 0.00010460482913313104, + "loss": 0.0124, + "step": 3620 + }, + { + "epoch": 0.955770878650056, + "grad_norm": 0.2389964610338211, + "learning_rate": 0.0001044728856049611, + "loss": 0.0169, + "step": 3625 + }, + { + "epoch": 0.957089183310263, + "grad_norm": 2.030608654022217, + "learning_rate": 0.00010434094207679115, + "loss": 0.0518, + "step": 3630 + }, + { + "epoch": 0.9584074879704699, + "grad_norm": 0.05979987606406212, + "learning_rate": 0.0001042089985486212, + "loss": 0.0081, + "step": 3635 + }, + { + "epoch": 0.9597257926306769, + "grad_norm": 0.15761719644069672, + "learning_rate": 0.00010407705502045125, + "loss": 0.0061, + "step": 3640 + }, + { + "epoch": 0.9610440972908839, + "grad_norm": 0.6534290909767151, + "learning_rate": 0.0001039451114922813, + "loss": 0.0104, + "step": 3645 + }, + { + "epoch": 0.9623624019510909, + "grad_norm": 1.0324147939682007, + "learning_rate": 0.00010381316796411136, + "loss": 0.0381, + "step": 3650 + }, + { + "epoch": 0.9636807066112979, + "grad_norm": 0.002968872431665659, + "learning_rate": 0.00010368122443594142, + "loss": 0.0343, + "step": 3655 + }, + { + "epoch": 0.9649990112715049, + "grad_norm": 0.011243184097111225, + "learning_rate": 0.00010354928090777147, + "loss": 0.019, + "step": 3660 + }, + { + "epoch": 0.9663173159317118, + "grad_norm": 0.17663739621639252, + "learning_rate": 0.00010341733737960153, + "loss": 0.0452, + "step": 3665 + }, + { + "epoch": 0.9676356205919188, + "grad_norm": 1.2647719383239746, + "learning_rate": 0.00010328539385143158, + "loss": 0.0154, + "step": 3670 + }, + { + "epoch": 0.9689539252521258, + "grad_norm": 0.3691752552986145, + "learning_rate": 0.00010315345032326165, + "loss": 0.028, + "step": 3675 + }, + { + "epoch": 0.9702722299123328, + "grad_norm": 0.0015879774000495672, + "learning_rate": 0.00010302150679509171, + "loss": 0.0202, + "step": 3680 + }, + { + "epoch": 0.9715905345725397, + "grad_norm": 0.1441984623670578, + "learning_rate": 0.00010288956326692176, + "loss": 
0.0221, + "step": 3685 + }, + { + "epoch": 0.9729088392327467, + "grad_norm": 0.20431455969810486, + "learning_rate": 0.00010275761973875182, + "loss": 0.0072, + "step": 3690 + }, + { + "epoch": 0.9742271438929536, + "grad_norm": 0.861625611782074, + "learning_rate": 0.00010262567621058187, + "loss": 0.0523, + "step": 3695 + }, + { + "epoch": 0.9755454485531606, + "grad_norm": 0.005049478262662888, + "learning_rate": 0.00010249373268241193, + "loss": 0.0051, + "step": 3700 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.49685510993003845, + "learning_rate": 0.000102361789154242, + "loss": 0.023, + "step": 3705 + }, + { + "epoch": 0.9781820578735746, + "grad_norm": 0.08789395540952682, + "learning_rate": 0.00010222984562607205, + "loss": 0.0159, + "step": 3710 + }, + { + "epoch": 0.9795003625337816, + "grad_norm": 0.027168691158294678, + "learning_rate": 0.00010209790209790211, + "loss": 0.0083, + "step": 3715 + }, + { + "epoch": 0.9808186671939886, + "grad_norm": 0.0006773864733986557, + "learning_rate": 0.00010196595856973217, + "loss": 0.0048, + "step": 3720 + }, + { + "epoch": 0.9821369718541955, + "grad_norm": 0.01636457070708275, + "learning_rate": 0.00010183401504156222, + "loss": 0.0159, + "step": 3725 + }, + { + "epoch": 0.9834552765144025, + "grad_norm": 0.10160859674215317, + "learning_rate": 0.00010170207151339228, + "loss": 0.0047, + "step": 3730 + }, + { + "epoch": 0.9847735811746094, + "grad_norm": 0.14173269271850586, + "learning_rate": 0.00010157012798522232, + "loss": 0.006, + "step": 3735 + }, + { + "epoch": 0.9860918858348164, + "grad_norm": 0.003458512481302023, + "learning_rate": 0.00010143818445705238, + "loss": 0.0193, + "step": 3740 + }, + { + "epoch": 0.9874101904950234, + "grad_norm": 0.005163820460438728, + "learning_rate": 0.00010130624092888243, + "loss": 0.0039, + "step": 3745 + }, + { + "epoch": 0.9887284951552304, + "grad_norm": 0.005913791712373495, + "learning_rate": 0.00010117429740071249, + "loss": 0.0119, + "step": 3750 + }, + { + "epoch": 0.9900467998154373, + "grad_norm": 0.00800853967666626, + "learning_rate": 0.00010104235387254256, + "loss": 0.044, + "step": 3755 + }, + { + "epoch": 0.9913651044756443, + "grad_norm": 0.18146778643131256, + "learning_rate": 0.00010091041034437261, + "loss": 0.0048, + "step": 3760 + }, + { + "epoch": 0.9926834091358513, + "grad_norm": 0.01235104724764824, + "learning_rate": 0.00010077846681620267, + "loss": 0.0017, + "step": 3765 + }, + { + "epoch": 0.9940017137960583, + "grad_norm": 0.17677897214889526, + "learning_rate": 0.00010064652328803272, + "loss": 0.0339, + "step": 3770 + }, + { + "epoch": 0.9953200184562653, + "grad_norm": 0.0017472271574661136, + "learning_rate": 0.00010051457975986278, + "loss": 0.0494, + "step": 3775 + }, + { + "epoch": 0.9966383231164723, + "grad_norm": 0.10814860463142395, + "learning_rate": 0.00010038263623169283, + "loss": 0.0741, + "step": 3780 + }, + { + "epoch": 0.9979566277766792, + "grad_norm": 0.11329760402441025, + "learning_rate": 0.0001002506927035229, + "loss": 0.0182, + "step": 3785 + }, + { + "epoch": 0.9992749324368861, + "grad_norm": 0.11573276668787003, + "learning_rate": 0.00010011874917535296, + "loss": 0.0068, + "step": 3790 + }, + { + "epoch": 1.000790982796124, + "grad_norm": 0.08449886739253998, + "learning_rate": 9.998680564718301e-05, + "loss": 0.0141, + "step": 3795 + }, + { + "epoch": 1.002109287456331, + "grad_norm": 0.05035184696316719, + "learning_rate": 9.985486211901307e-05, + "loss": 0.0293, + "step": 3800 + }, + { + "epoch": 1.003427592116538, 
+ "grad_norm": 0.0255444198846817, + "learning_rate": 9.972291859084313e-05, + "loss": 0.0054, + "step": 3805 + }, + { + "epoch": 1.004745896776745, + "grad_norm": 0.0033677336759865284, + "learning_rate": 9.959097506267318e-05, + "loss": 0.0567, + "step": 3810 + }, + { + "epoch": 1.006064201436952, + "grad_norm": 0.09453682601451874, + "learning_rate": 9.945903153450324e-05, + "loss": 0.0589, + "step": 3815 + }, + { + "epoch": 1.007382506097159, + "grad_norm": 0.01592979207634926, + "learning_rate": 9.932708800633329e-05, + "loss": 0.0043, + "step": 3820 + }, + { + "epoch": 1.008700810757366, + "grad_norm": 0.002263693604618311, + "learning_rate": 9.919514447816335e-05, + "loss": 0.0195, + "step": 3825 + }, + { + "epoch": 1.010019115417573, + "grad_norm": 0.013390793465077877, + "learning_rate": 9.90632009499934e-05, + "loss": 0.0152, + "step": 3830 + }, + { + "epoch": 1.01133742007778, + "grad_norm": 0.10473847389221191, + "learning_rate": 9.893125742182346e-05, + "loss": 0.0606, + "step": 3835 + }, + { + "epoch": 1.012655724737987, + "grad_norm": 0.05837221071124077, + "learning_rate": 9.879931389365353e-05, + "loss": 0.0121, + "step": 3840 + }, + { + "epoch": 1.013974029398194, + "grad_norm": 0.3803791105747223, + "learning_rate": 9.866737036548358e-05, + "loss": 0.0386, + "step": 3845 + }, + { + "epoch": 1.0152923340584008, + "grad_norm": 0.4067519009113312, + "learning_rate": 9.853542683731364e-05, + "loss": 0.0115, + "step": 3850 + }, + { + "epoch": 1.0166106387186078, + "grad_norm": 0.02585229091346264, + "learning_rate": 9.84034833091437e-05, + "loss": 0.0214, + "step": 3855 + }, + { + "epoch": 1.0179289433788148, + "grad_norm": 0.03670825809240341, + "learning_rate": 9.827153978097374e-05, + "loss": 0.0059, + "step": 3860 + }, + { + "epoch": 1.0192472480390218, + "grad_norm": 0.014171554706990719, + "learning_rate": 9.81395962528038e-05, + "loss": 0.0145, + "step": 3865 + }, + { + "epoch": 1.0205655526992288, + "grad_norm": 0.027376385405659676, + "learning_rate": 9.800765272463386e-05, + "loss": 0.0089, + "step": 3870 + }, + { + "epoch": 1.0218838573594358, + "grad_norm": 0.03168405964970589, + "learning_rate": 9.787570919646392e-05, + "loss": 0.0132, + "step": 3875 + }, + { + "epoch": 1.0232021620196428, + "grad_norm": 0.03346199914813042, + "learning_rate": 9.774376566829397e-05, + "loss": 0.0246, + "step": 3880 + }, + { + "epoch": 1.0245204666798498, + "grad_norm": 0.00894144270569086, + "learning_rate": 9.761182214012403e-05, + "loss": 0.0105, + "step": 3885 + }, + { + "epoch": 1.0258387713400567, + "grad_norm": 0.3172806203365326, + "learning_rate": 9.747987861195409e-05, + "loss": 0.0103, + "step": 3890 + }, + { + "epoch": 1.0271570760002637, + "grad_norm": 0.009055040776729584, + "learning_rate": 9.734793508378414e-05, + "loss": 0.0103, + "step": 3895 + }, + { + "epoch": 1.0284753806604707, + "grad_norm": 0.014140011742711067, + "learning_rate": 9.721599155561421e-05, + "loss": 0.0037, + "step": 3900 + }, + { + "epoch": 1.0297936853206777, + "grad_norm": 0.008317383006215096, + "learning_rate": 9.708404802744427e-05, + "loss": 0.002, + "step": 3905 + }, + { + "epoch": 1.0311119899808845, + "grad_norm": 0.005038558971136808, + "learning_rate": 9.695210449927431e-05, + "loss": 0.0017, + "step": 3910 + }, + { + "epoch": 1.0324302946410915, + "grad_norm": 0.40058520436286926, + "learning_rate": 9.682016097110436e-05, + "loss": 0.0065, + "step": 3915 + }, + { + "epoch": 1.0337485993012985, + "grad_norm": 0.005197151098400354, + "learning_rate": 9.668821744293442e-05, + "loss": 
0.0031, + "step": 3920 + }, + { + "epoch": 1.0350669039615055, + "grad_norm": 0.014353781007230282, + "learning_rate": 9.655627391476449e-05, + "loss": 0.0009, + "step": 3925 + }, + { + "epoch": 1.0363852086217125, + "grad_norm": 0.13260559737682343, + "learning_rate": 9.642433038659454e-05, + "loss": 0.0323, + "step": 3930 + }, + { + "epoch": 1.0377035132819195, + "grad_norm": 0.006795065477490425, + "learning_rate": 9.62923868584246e-05, + "loss": 0.0022, + "step": 3935 + }, + { + "epoch": 1.0390218179421264, + "grad_norm": 0.2276086062192917, + "learning_rate": 9.616044333025466e-05, + "loss": 0.0221, + "step": 3940 + }, + { + "epoch": 1.0403401226023334, + "grad_norm": 0.06121920794248581, + "learning_rate": 9.602849980208471e-05, + "loss": 0.0037, + "step": 3945 + }, + { + "epoch": 1.0416584272625404, + "grad_norm": 0.9180755019187927, + "learning_rate": 9.589655627391477e-05, + "loss": 0.0589, + "step": 3950 + }, + { + "epoch": 1.0429767319227474, + "grad_norm": 0.07515591382980347, + "learning_rate": 9.576461274574484e-05, + "loss": 0.0653, + "step": 3955 + }, + { + "epoch": 1.0442950365829544, + "grad_norm": 0.018060607835650444, + "learning_rate": 9.563266921757488e-05, + "loss": 0.0178, + "step": 3960 + }, + { + "epoch": 1.0456133412431612, + "grad_norm": 0.02751368284225464, + "learning_rate": 9.550072568940493e-05, + "loss": 0.0076, + "step": 3965 + }, + { + "epoch": 1.0469316459033682, + "grad_norm": 0.653998613357544, + "learning_rate": 9.536878216123499e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 1.0482499505635752, + "grad_norm": 0.3117768168449402, + "learning_rate": 9.523683863306505e-05, + "loss": 0.0087, + "step": 3975 + }, + { + "epoch": 1.0495682552237822, + "grad_norm": 0.013952831737697124, + "learning_rate": 9.510489510489511e-05, + "loss": 0.0037, + "step": 3980 + }, + { + "epoch": 1.0508865598839892, + "grad_norm": 0.01806250400841236, + "learning_rate": 9.497295157672517e-05, + "loss": 0.0028, + "step": 3985 + }, + { + "epoch": 1.0522048645441962, + "grad_norm": 0.13678006827831268, + "learning_rate": 9.484100804855523e-05, + "loss": 0.0533, + "step": 3990 + }, + { + "epoch": 1.0535231692044031, + "grad_norm": 0.14869382977485657, + "learning_rate": 9.470906452038528e-05, + "loss": 0.009, + "step": 3995 + }, + { + "epoch": 1.0548414738646101, + "grad_norm": 0.33614659309387207, + "learning_rate": 9.457712099221534e-05, + "loss": 0.0555, + "step": 4000 + }, + { + "epoch": 1.0548414738646101, + "eval_loss": 0.026165226474404335, + "eval_runtime": 452.2482, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 4000 + }, + { + "epoch": 1.0561597785248171, + "grad_norm": 0.007546027656644583, + "learning_rate": 9.444517746404539e-05, + "loss": 0.0029, + "step": 4005 + }, + { + "epoch": 1.0574780831850241, + "grad_norm": 0.3720332384109497, + "learning_rate": 9.431323393587545e-05, + "loss": 0.0353, + "step": 4010 + }, + { + "epoch": 1.0587963878452311, + "grad_norm": 1.1335264444351196, + "learning_rate": 9.41812904077055e-05, + "loss": 0.0142, + "step": 4015 + }, + { + "epoch": 1.060114692505438, + "grad_norm": 0.024723488837480545, + "learning_rate": 9.404934687953556e-05, + "loss": 0.006, + "step": 4020 + }, + { + "epoch": 1.0614329971656449, + "grad_norm": 0.040354058146476746, + "learning_rate": 9.391740335136562e-05, + "loss": 0.0107, + "step": 4025 + }, + { + "epoch": 1.0627513018258519, + "grad_norm": 0.222810298204422, + "learning_rate": 9.378545982319567e-05, + "loss": 0.0273, + "step": 4030 + }, + { + "epoch": 
1.0640696064860589, + "grad_norm": 0.025684095919132233, + "learning_rate": 9.365351629502574e-05, + "loss": 0.0033, + "step": 4035 + }, + { + "epoch": 1.0653879111462659, + "grad_norm": 0.05338352546095848, + "learning_rate": 9.35215727668558e-05, + "loss": 0.0052, + "step": 4040 + }, + { + "epoch": 1.0667062158064728, + "grad_norm": 0.06182330474257469, + "learning_rate": 9.338962923868585e-05, + "loss": 0.0038, + "step": 4045 + }, + { + "epoch": 1.0680245204666798, + "grad_norm": 0.012170832604169846, + "learning_rate": 9.325768571051591e-05, + "loss": 0.0018, + "step": 4050 + }, + { + "epoch": 1.0693428251268868, + "grad_norm": 0.5424306392669678, + "learning_rate": 9.312574218234596e-05, + "loss": 0.0445, + "step": 4055 + }, + { + "epoch": 1.0706611297870938, + "grad_norm": 0.017939254641532898, + "learning_rate": 9.299379865417602e-05, + "loss": 0.0389, + "step": 4060 + }, + { + "epoch": 1.0719794344473008, + "grad_norm": 0.0060431682504713535, + "learning_rate": 9.286185512600607e-05, + "loss": 0.0025, + "step": 4065 + }, + { + "epoch": 1.0732977391075078, + "grad_norm": 0.0071444883942604065, + "learning_rate": 9.272991159783613e-05, + "loss": 0.0333, + "step": 4070 + }, + { + "epoch": 1.0746160437677148, + "grad_norm": 0.29632750153541565, + "learning_rate": 9.259796806966619e-05, + "loss": 0.0151, + "step": 4075 + }, + { + "epoch": 1.0759343484279218, + "grad_norm": 0.004526323173195124, + "learning_rate": 9.246602454149624e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 1.0772526530881286, + "grad_norm": 0.023945212364196777, + "learning_rate": 9.23340810133263e-05, + "loss": 0.004, + "step": 4085 + }, + { + "epoch": 1.0785709577483356, + "grad_norm": 0.13235126435756683, + "learning_rate": 9.220213748515635e-05, + "loss": 0.0059, + "step": 4090 + }, + { + "epoch": 1.0798892624085425, + "grad_norm": 0.17592330276966095, + "learning_rate": 9.207019395698642e-05, + "loss": 0.0302, + "step": 4095 + }, + { + "epoch": 1.0812075670687495, + "grad_norm": 0.004582866560667753, + "learning_rate": 9.193825042881648e-05, + "loss": 0.009, + "step": 4100 + }, + { + "epoch": 1.0825258717289565, + "grad_norm": 0.15214525163173676, + "learning_rate": 9.180630690064653e-05, + "loss": 0.0062, + "step": 4105 + }, + { + "epoch": 1.0838441763891635, + "grad_norm": 0.16535983979701996, + "learning_rate": 9.167436337247658e-05, + "loss": 0.0926, + "step": 4110 + }, + { + "epoch": 1.0851624810493705, + "grad_norm": 0.013285227119922638, + "learning_rate": 9.154241984430663e-05, + "loss": 0.0043, + "step": 4115 + }, + { + "epoch": 1.0864807857095775, + "grad_norm": 0.012116984464228153, + "learning_rate": 9.14104763161367e-05, + "loss": 0.0037, + "step": 4120 + }, + { + "epoch": 1.0877990903697845, + "grad_norm": 0.0373845212161541, + "learning_rate": 9.127853278796676e-05, + "loss": 0.0081, + "step": 4125 + }, + { + "epoch": 1.0891173950299915, + "grad_norm": 0.09324615448713303, + "learning_rate": 9.114658925979681e-05, + "loss": 0.0534, + "step": 4130 + }, + { + "epoch": 1.0904356996901985, + "grad_norm": 0.010992968454957008, + "learning_rate": 9.101464573162687e-05, + "loss": 0.0025, + "step": 4135 + }, + { + "epoch": 1.0917540043504055, + "grad_norm": 0.13710318505764008, + "learning_rate": 9.088270220345692e-05, + "loss": 0.0555, + "step": 4140 + }, + { + "epoch": 1.0930723090106123, + "grad_norm": 0.010403074324131012, + "learning_rate": 9.075075867528698e-05, + "loss": 0.0042, + "step": 4145 + }, + { + "epoch": 1.0943906136708192, + "grad_norm": 0.21544460952281952, + "learning_rate": 
9.061881514711705e-05, + "loss": 0.0144, + "step": 4150 + }, + { + "epoch": 1.0957089183310262, + "grad_norm": 0.04194799065589905, + "learning_rate": 9.04868716189471e-05, + "loss": 0.0106, + "step": 4155 + }, + { + "epoch": 1.0970272229912332, + "grad_norm": 0.029204202815890312, + "learning_rate": 9.035492809077715e-05, + "loss": 0.0085, + "step": 4160 + }, + { + "epoch": 1.0983455276514402, + "grad_norm": 0.006751026958227158, + "learning_rate": 9.02229845626072e-05, + "loss": 0.0049, + "step": 4165 + }, + { + "epoch": 1.0996638323116472, + "grad_norm": 0.008232722990214825, + "learning_rate": 9.009104103443726e-05, + "loss": 0.0172, + "step": 4170 + }, + { + "epoch": 1.1009821369718542, + "grad_norm": 0.05630079656839371, + "learning_rate": 8.995909750626733e-05, + "loss": 0.0112, + "step": 4175 + }, + { + "epoch": 1.1023004416320612, + "grad_norm": 0.0011601662263274193, + "learning_rate": 8.982715397809738e-05, + "loss": 0.0317, + "step": 4180 + }, + { + "epoch": 1.1036187462922682, + "grad_norm": 0.006554402410984039, + "learning_rate": 8.969521044992744e-05, + "loss": 0.0035, + "step": 4185 + }, + { + "epoch": 1.1049370509524752, + "grad_norm": 0.34513652324676514, + "learning_rate": 8.956326692175749e-05, + "loss": 0.0036, + "step": 4190 + }, + { + "epoch": 1.1062553556126822, + "grad_norm": 0.283669650554657, + "learning_rate": 8.943132339358755e-05, + "loss": 0.0182, + "step": 4195 + }, + { + "epoch": 1.1075736602728892, + "grad_norm": 0.5376952290534973, + "learning_rate": 8.92993798654176e-05, + "loss": 0.0293, + "step": 4200 + }, + { + "epoch": 1.108891964933096, + "grad_norm": 0.01689724065363407, + "learning_rate": 8.916743633724767e-05, + "loss": 0.0206, + "step": 4205 + }, + { + "epoch": 1.110210269593303, + "grad_norm": 0.026538770645856857, + "learning_rate": 8.903549280907772e-05, + "loss": 0.0181, + "step": 4210 + }, + { + "epoch": 1.11152857425351, + "grad_norm": 0.6372873783111572, + "learning_rate": 8.890354928090777e-05, + "loss": 0.021, + "step": 4215 + }, + { + "epoch": 1.112846878913717, + "grad_norm": 0.06177428737282753, + "learning_rate": 8.877160575273783e-05, + "loss": 0.0033, + "step": 4220 + }, + { + "epoch": 1.114165183573924, + "grad_norm": 0.3712109923362732, + "learning_rate": 8.863966222456788e-05, + "loss": 0.0075, + "step": 4225 + }, + { + "epoch": 1.115483488234131, + "grad_norm": 0.030514653772115707, + "learning_rate": 8.850771869639795e-05, + "loss": 0.0183, + "step": 4230 + }, + { + "epoch": 1.116801792894338, + "grad_norm": 0.012861707247793674, + "learning_rate": 8.837577516822801e-05, + "loss": 0.0032, + "step": 4235 + }, + { + "epoch": 1.118120097554545, + "grad_norm": 0.3278522789478302, + "learning_rate": 8.824383164005806e-05, + "loss": 0.0058, + "step": 4240 + }, + { + "epoch": 1.1194384022147519, + "grad_norm": 0.580259382724762, + "learning_rate": 8.811188811188812e-05, + "loss": 0.0068, + "step": 4245 + }, + { + "epoch": 1.1207567068749589, + "grad_norm": 0.007002575788646936, + "learning_rate": 8.797994458371817e-05, + "loss": 0.0063, + "step": 4250 + }, + { + "epoch": 1.1220750115351659, + "grad_norm": 0.22484643757343292, + "learning_rate": 8.784800105554823e-05, + "loss": 0.0167, + "step": 4255 + }, + { + "epoch": 1.1233933161953726, + "grad_norm": 0.004122686106711626, + "learning_rate": 8.771605752737829e-05, + "loss": 0.002, + "step": 4260 + }, + { + "epoch": 1.1247116208555796, + "grad_norm": 0.009832561016082764, + "learning_rate": 8.758411399920834e-05, + "loss": 0.0029, + "step": 4265 + }, + { + "epoch": 
1.1260299255157866, + "grad_norm": 0.04854527860879898, + "learning_rate": 8.74521704710384e-05, + "loss": 0.0068, + "step": 4270 + }, + { + "epoch": 1.1273482301759936, + "grad_norm": 0.12221235036849976, + "learning_rate": 8.732022694286845e-05, + "loss": 0.003, + "step": 4275 + }, + { + "epoch": 1.1286665348362006, + "grad_norm": 0.005857539363205433, + "learning_rate": 8.718828341469851e-05, + "loss": 0.0022, + "step": 4280 + }, + { + "epoch": 1.1299848394964076, + "grad_norm": 0.10582758486270905, + "learning_rate": 8.705633988652856e-05, + "loss": 0.002, + "step": 4285 + }, + { + "epoch": 1.1313031441566146, + "grad_norm": 0.006190940272063017, + "learning_rate": 8.692439635835863e-05, + "loss": 0.0022, + "step": 4290 + }, + { + "epoch": 1.1326214488168216, + "grad_norm": 0.00221514655277133, + "learning_rate": 8.679245283018869e-05, + "loss": 0.0314, + "step": 4295 + }, + { + "epoch": 1.1339397534770286, + "grad_norm": 0.0796755850315094, + "learning_rate": 8.666050930201874e-05, + "loss": 0.0347, + "step": 4300 + }, + { + "epoch": 1.1352580581372356, + "grad_norm": 0.20088806748390198, + "learning_rate": 8.65285657738488e-05, + "loss": 0.0048, + "step": 4305 + }, + { + "epoch": 1.1365763627974426, + "grad_norm": 0.4018377363681793, + "learning_rate": 8.639662224567884e-05, + "loss": 0.0234, + "step": 4310 + }, + { + "epoch": 1.1378946674576496, + "grad_norm": 0.014961684122681618, + "learning_rate": 8.626467871750891e-05, + "loss": 0.0033, + "step": 4315 + }, + { + "epoch": 1.1392129721178565, + "grad_norm": 0.004534922540187836, + "learning_rate": 8.613273518933897e-05, + "loss": 0.0021, + "step": 4320 + }, + { + "epoch": 1.1405312767780633, + "grad_norm": 0.06340984255075455, + "learning_rate": 8.600079166116902e-05, + "loss": 0.0538, + "step": 4325 + }, + { + "epoch": 1.1418495814382703, + "grad_norm": 0.007374623324722052, + "learning_rate": 8.586884813299908e-05, + "loss": 0.0157, + "step": 4330 + }, + { + "epoch": 1.1431678860984773, + "grad_norm": 0.02313193492591381, + "learning_rate": 8.573690460482913e-05, + "loss": 0.0307, + "step": 4335 + }, + { + "epoch": 1.1444861907586843, + "grad_norm": 0.014071634039282799, + "learning_rate": 8.560496107665919e-05, + "loss": 0.0058, + "step": 4340 + }, + { + "epoch": 1.1458044954188913, + "grad_norm": 1.4664901494979858, + "learning_rate": 8.547301754848926e-05, + "loss": 0.0566, + "step": 4345 + }, + { + "epoch": 1.1471228000790983, + "grad_norm": 0.023680074140429497, + "learning_rate": 8.534107402031931e-05, + "loss": 0.0048, + "step": 4350 + }, + { + "epoch": 1.1484411047393053, + "grad_norm": 0.012555698864161968, + "learning_rate": 8.520913049214937e-05, + "loss": 0.0076, + "step": 4355 + }, + { + "epoch": 1.1497594093995123, + "grad_norm": 0.013624129816889763, + "learning_rate": 8.507718696397941e-05, + "loss": 0.0373, + "step": 4360 + }, + { + "epoch": 1.1510777140597193, + "grad_norm": 0.015372387133538723, + "learning_rate": 8.494524343580947e-05, + "loss": 0.0147, + "step": 4365 + }, + { + "epoch": 1.1523960187199263, + "grad_norm": 0.3312993347644806, + "learning_rate": 8.481329990763954e-05, + "loss": 0.0299, + "step": 4370 + }, + { + "epoch": 1.1537143233801332, + "grad_norm": 0.023838184773921967, + "learning_rate": 8.468135637946959e-05, + "loss": 0.0226, + "step": 4375 + }, + { + "epoch": 1.15503262804034, + "grad_norm": 0.42516952753067017, + "learning_rate": 8.454941285129965e-05, + "loss": 0.0088, + "step": 4380 + }, + { + "epoch": 1.156350932700547, + "grad_norm": 0.6900278925895691, + "learning_rate": 
8.44174693231297e-05, + "loss": 0.0245, + "step": 4385 + }, + { + "epoch": 1.157669237360754, + "grad_norm": 0.2932703197002411, + "learning_rate": 8.428552579495976e-05, + "loss": 0.0207, + "step": 4390 + }, + { + "epoch": 1.158987542020961, + "grad_norm": 0.12942780554294586, + "learning_rate": 8.415358226678982e-05, + "loss": 0.0037, + "step": 4395 + }, + { + "epoch": 1.160305846681168, + "grad_norm": 0.9499046802520752, + "learning_rate": 8.402163873861989e-05, + "loss": 0.0246, + "step": 4400 + }, + { + "epoch": 1.161624151341375, + "grad_norm": 0.008869118988513947, + "learning_rate": 8.388969521044994e-05, + "loss": 0.0171, + "step": 4405 + }, + { + "epoch": 1.162942456001582, + "grad_norm": 1.7409231662750244, + "learning_rate": 8.375775168227998e-05, + "loss": 0.017, + "step": 4410 + }, + { + "epoch": 1.164260760661789, + "grad_norm": 0.0020101398695260286, + "learning_rate": 8.362580815411004e-05, + "loss": 0.0027, + "step": 4415 + }, + { + "epoch": 1.165579065321996, + "grad_norm": 0.0785067081451416, + "learning_rate": 8.34938646259401e-05, + "loss": 0.0043, + "step": 4420 + }, + { + "epoch": 1.166897369982203, + "grad_norm": 0.0029506285209208727, + "learning_rate": 8.336192109777016e-05, + "loss": 0.0109, + "step": 4425 + }, + { + "epoch": 1.16821567464241, + "grad_norm": 0.02216683328151703, + "learning_rate": 8.322997756960022e-05, + "loss": 0.0026, + "step": 4430 + }, + { + "epoch": 1.1695339793026167, + "grad_norm": 0.02216639369726181, + "learning_rate": 8.309803404143027e-05, + "loss": 0.0045, + "step": 4435 + }, + { + "epoch": 1.170852283962824, + "grad_norm": 0.0, + "learning_rate": 8.296609051326033e-05, + "loss": 0.006, + "step": 4440 + }, + { + "epoch": 1.1721705886230307, + "grad_norm": 0.0019736960530281067, + "learning_rate": 8.283414698509039e-05, + "loss": 0.0078, + "step": 4445 + }, + { + "epoch": 1.1734888932832377, + "grad_norm": 0.012957746163010597, + "learning_rate": 8.270220345692044e-05, + "loss": 0.002, + "step": 4450 + }, + { + "epoch": 1.1748071979434447, + "grad_norm": 0.010877869091928005, + "learning_rate": 8.25702599287505e-05, + "loss": 0.0237, + "step": 4455 + }, + { + "epoch": 1.1761255026036517, + "grad_norm": 0.005947659723460674, + "learning_rate": 8.243831640058055e-05, + "loss": 0.0341, + "step": 4460 + }, + { + "epoch": 1.1774438072638587, + "grad_norm": 0.0005026470171287656, + "learning_rate": 8.230637287241061e-05, + "loss": 0.0033, + "step": 4465 + }, + { + "epoch": 1.1787621119240657, + "grad_norm": 0.022054588422179222, + "learning_rate": 8.217442934424066e-05, + "loss": 0.0042, + "step": 4470 + }, + { + "epoch": 1.1800804165842727, + "grad_norm": 0.7929030656814575, + "learning_rate": 8.204248581607072e-05, + "loss": 0.0076, + "step": 4475 + }, + { + "epoch": 1.1813987212444796, + "grad_norm": 0.39052629470825195, + "learning_rate": 8.191054228790078e-05, + "loss": 0.0228, + "step": 4480 + }, + { + "epoch": 1.1827170259046866, + "grad_norm": 0.007177622988820076, + "learning_rate": 8.177859875973084e-05, + "loss": 0.01, + "step": 4485 + }, + { + "epoch": 1.1840353305648936, + "grad_norm": 0.006175135262310505, + "learning_rate": 8.16466552315609e-05, + "loss": 0.0037, + "step": 4490 + }, + { + "epoch": 1.1853536352251006, + "grad_norm": 0.0356481671333313, + "learning_rate": 8.151471170339096e-05, + "loss": 0.0024, + "step": 4495 + }, + { + "epoch": 1.1866719398853074, + "grad_norm": 0.19069480895996094, + "learning_rate": 8.138276817522101e-05, + "loss": 0.0048, + "step": 4500 + }, + { + "epoch": 1.1866719398853074, + 
"eval_loss": 0.026386437937617302, + "eval_runtime": 452.2896, + "eval_samples_per_second": 7.455, + "eval_steps_per_second": 3.728, + "step": 4500 + }, + { + "epoch": 1.1879902445455144, + "grad_norm": 0.002254961524158716, + "learning_rate": 8.125082464705107e-05, + "loss": 0.0014, + "step": 4505 + }, + { + "epoch": 1.1893085492057214, + "grad_norm": 0.8026870489120483, + "learning_rate": 8.111888111888112e-05, + "loss": 0.0411, + "step": 4510 + }, + { + "epoch": 1.1906268538659284, + "grad_norm": 0.47328072786331177, + "learning_rate": 8.098693759071118e-05, + "loss": 0.0271, + "step": 4515 + }, + { + "epoch": 1.1919451585261354, + "grad_norm": 0.4888288676738739, + "learning_rate": 8.085499406254123e-05, + "loss": 0.039, + "step": 4520 + }, + { + "epoch": 1.1932634631863424, + "grad_norm": 0.000925812462810427, + "learning_rate": 8.072305053437129e-05, + "loss": 0.0461, + "step": 4525 + }, + { + "epoch": 1.1945817678465493, + "grad_norm": 0.12472371757030487, + "learning_rate": 8.059110700620135e-05, + "loss": 0.0037, + "step": 4530 + }, + { + "epoch": 1.1959000725067563, + "grad_norm": 0.002875336678698659, + "learning_rate": 8.04591634780314e-05, + "loss": 0.0425, + "step": 4535 + }, + { + "epoch": 1.1972183771669633, + "grad_norm": 0.042056187987327576, + "learning_rate": 8.032721994986147e-05, + "loss": 0.0068, + "step": 4540 + }, + { + "epoch": 1.1985366818271703, + "grad_norm": 0.157605841755867, + "learning_rate": 8.019527642169153e-05, + "loss": 0.0179, + "step": 4545 + }, + { + "epoch": 1.1998549864873773, + "grad_norm": 0.005153563339263201, + "learning_rate": 8.006333289352158e-05, + "loss": 0.0045, + "step": 4550 + }, + { + "epoch": 1.201173291147584, + "grad_norm": 0.02541598491370678, + "learning_rate": 7.993138936535164e-05, + "loss": 0.0041, + "step": 4555 + }, + { + "epoch": 1.2024915958077913, + "grad_norm": 0.04266195371747017, + "learning_rate": 7.979944583718168e-05, + "loss": 0.0121, + "step": 4560 + }, + { + "epoch": 1.203809900467998, + "grad_norm": 0.36108532547950745, + "learning_rate": 7.966750230901175e-05, + "loss": 0.0147, + "step": 4565 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.40405452251434326, + "learning_rate": 7.95355587808418e-05, + "loss": 0.0056, + "step": 4570 + }, + { + "epoch": 1.206446509788412, + "grad_norm": 0.030422702431678772, + "learning_rate": 7.940361525267186e-05, + "loss": 0.0055, + "step": 4575 + }, + { + "epoch": 1.207764814448619, + "grad_norm": 0.014555396512150764, + "learning_rate": 7.927167172450192e-05, + "loss": 0.0029, + "step": 4580 + }, + { + "epoch": 1.209083119108826, + "grad_norm": 0.33962950110435486, + "learning_rate": 7.913972819633197e-05, + "loss": 0.0191, + "step": 4585 + }, + { + "epoch": 1.210401423769033, + "grad_norm": 0.040150560438632965, + "learning_rate": 7.900778466816203e-05, + "loss": 0.0096, + "step": 4590 + }, + { + "epoch": 1.21171972842924, + "grad_norm": 0.2968510091304779, + "learning_rate": 7.88758411399921e-05, + "loss": 0.0311, + "step": 4595 + }, + { + "epoch": 1.213038033089447, + "grad_norm": 0.04709814116358757, + "learning_rate": 7.874389761182215e-05, + "loss": 0.0175, + "step": 4600 + }, + { + "epoch": 1.214356337749654, + "grad_norm": 0.1379537284374237, + "learning_rate": 7.861195408365221e-05, + "loss": 0.02, + "step": 4605 + }, + { + "epoch": 1.215674642409861, + "grad_norm": 0.018291711807250977, + "learning_rate": 7.848001055548225e-05, + "loss": 0.003, + "step": 4610 + }, + { + "epoch": 1.216992947070068, + "grad_norm": 0.041676126420497894, + "learning_rate": 
7.83480670273123e-05, + "loss": 0.0054, + "step": 4615 + }, + { + "epoch": 1.2183112517302748, + "grad_norm": 0.0013747498160228133, + "learning_rate": 7.821612349914237e-05, + "loss": 0.0132, + "step": 4620 + }, + { + "epoch": 1.2196295563904818, + "grad_norm": 0.0050489697605371475, + "learning_rate": 7.808417997097243e-05, + "loss": 0.0272, + "step": 4625 + }, + { + "epoch": 1.2209478610506888, + "grad_norm": 0.017974581569433212, + "learning_rate": 7.795223644280249e-05, + "loss": 0.0037, + "step": 4630 + }, + { + "epoch": 1.2222661657108957, + "grad_norm": 0.001916698063723743, + "learning_rate": 7.782029291463254e-05, + "loss": 0.002, + "step": 4635 + }, + { + "epoch": 1.2235844703711027, + "grad_norm": 0.05344574153423309, + "learning_rate": 7.76883493864626e-05, + "loss": 0.0114, + "step": 4640 + }, + { + "epoch": 1.2249027750313097, + "grad_norm": 0.22823786735534668, + "learning_rate": 7.755640585829265e-05, + "loss": 0.0296, + "step": 4645 + }, + { + "epoch": 1.2262210796915167, + "grad_norm": 0.02051074244081974, + "learning_rate": 7.742446233012272e-05, + "loss": 0.0037, + "step": 4650 + }, + { + "epoch": 1.2275393843517237, + "grad_norm": 0.9797061681747437, + "learning_rate": 7.729251880195276e-05, + "loss": 0.011, + "step": 4655 + }, + { + "epoch": 1.2288576890119307, + "grad_norm": 0.0017285927897319198, + "learning_rate": 7.716057527378282e-05, + "loss": 0.0224, + "step": 4660 + }, + { + "epoch": 1.2301759936721377, + "grad_norm": 0.021783018484711647, + "learning_rate": 7.702863174561288e-05, + "loss": 0.0174, + "step": 4665 + }, + { + "epoch": 1.2314942983323447, + "grad_norm": 0.00763307698071003, + "learning_rate": 7.689668821744293e-05, + "loss": 0.0516, + "step": 4670 + }, + { + "epoch": 1.2328126029925515, + "grad_norm": 0.32605209946632385, + "learning_rate": 7.676474468927299e-05, + "loss": 0.0301, + "step": 4675 + }, + { + "epoch": 1.2341309076527585, + "grad_norm": 1.2027722597122192, + "learning_rate": 7.663280116110306e-05, + "loss": 0.0474, + "step": 4680 + }, + { + "epoch": 1.2354492123129655, + "grad_norm": 0.10201717168092728, + "learning_rate": 7.650085763293311e-05, + "loss": 0.0144, + "step": 4685 + }, + { + "epoch": 1.2367675169731724, + "grad_norm": 0.013835664838552475, + "learning_rate": 7.636891410476317e-05, + "loss": 0.0024, + "step": 4690 + }, + { + "epoch": 1.2380858216333794, + "grad_norm": 0.005699916277080774, + "learning_rate": 7.623697057659322e-05, + "loss": 0.0089, + "step": 4695 + }, + { + "epoch": 1.2394041262935864, + "grad_norm": 0.16583332419395447, + "learning_rate": 7.610502704842328e-05, + "loss": 0.019, + "step": 4700 + }, + { + "epoch": 1.2407224309537934, + "grad_norm": 0.2734023332595825, + "learning_rate": 7.597308352025333e-05, + "loss": 0.0041, + "step": 4705 + }, + { + "epoch": 1.2420407356140004, + "grad_norm": 0.04209504276514053, + "learning_rate": 7.584113999208339e-05, + "loss": 0.0292, + "step": 4710 + }, + { + "epoch": 1.2433590402742074, + "grad_norm": 0.0303195733577013, + "learning_rate": 7.570919646391345e-05, + "loss": 0.0019, + "step": 4715 + }, + { + "epoch": 1.2446773449344144, + "grad_norm": 0.014011899940669537, + "learning_rate": 7.55772529357435e-05, + "loss": 0.0236, + "step": 4720 + }, + { + "epoch": 1.2459956495946214, + "grad_norm": 0.37838876247406006, + "learning_rate": 7.544530940757356e-05, + "loss": 0.0081, + "step": 4725 + }, + { + "epoch": 1.2473139542548284, + "grad_norm": 0.003717717481777072, + "learning_rate": 7.531336587940361e-05, + "loss": 0.0036, + "step": 4730 + }, + { + "epoch": 
1.2486322589150354, + "grad_norm": 1.2284752130508423, + "learning_rate": 7.518142235123368e-05, + "loss": 0.0089, + "step": 4735 + }, + { + "epoch": 1.2499505635752421, + "grad_norm": 0.015356095507740974, + "learning_rate": 7.504947882306374e-05, + "loss": 0.0074, + "step": 4740 + }, + { + "epoch": 1.2512688682354491, + "grad_norm": 0.0020383282098919153, + "learning_rate": 7.49175352948938e-05, + "loss": 0.0444, + "step": 4745 + }, + { + "epoch": 1.2525871728956561, + "grad_norm": 0.006680132355540991, + "learning_rate": 7.478559176672385e-05, + "loss": 0.009, + "step": 4750 + }, + { + "epoch": 1.2539054775558631, + "grad_norm": 0.01650019735097885, + "learning_rate": 7.465364823855389e-05, + "loss": 0.0022, + "step": 4755 + }, + { + "epoch": 1.2552237822160701, + "grad_norm": 0.009536102414131165, + "learning_rate": 7.452170471038396e-05, + "loss": 0.0026, + "step": 4760 + }, + { + "epoch": 1.256542086876277, + "grad_norm": 0.04677430912852287, + "learning_rate": 7.438976118221402e-05, + "loss": 0.004, + "step": 4765 + }, + { + "epoch": 1.257860391536484, + "grad_norm": 0.007777783088386059, + "learning_rate": 7.425781765404407e-05, + "loss": 0.0112, + "step": 4770 + }, + { + "epoch": 1.259178696196691, + "grad_norm": 0.03724197298288345, + "learning_rate": 7.412587412587413e-05, + "loss": 0.0065, + "step": 4775 + }, + { + "epoch": 1.260497000856898, + "grad_norm": 0.0023958412930369377, + "learning_rate": 7.399393059770418e-05, + "loss": 0.0238, + "step": 4780 + }, + { + "epoch": 1.261815305517105, + "grad_norm": 0.0036889975890517235, + "learning_rate": 7.386198706953424e-05, + "loss": 0.0012, + "step": 4785 + }, + { + "epoch": 1.263133610177312, + "grad_norm": 0.0009220903157256544, + "learning_rate": 7.373004354136431e-05, + "loss": 0.0017, + "step": 4790 + }, + { + "epoch": 1.2644519148375188, + "grad_norm": 0.0033395602367818356, + "learning_rate": 7.359810001319436e-05, + "loss": 0.0474, + "step": 4795 + }, + { + "epoch": 1.265770219497726, + "grad_norm": 0.004093261435627937, + "learning_rate": 7.346615648502442e-05, + "loss": 0.0025, + "step": 4800 + }, + { + "epoch": 1.2670885241579328, + "grad_norm": 0.004395488649606705, + "learning_rate": 7.333421295685446e-05, + "loss": 0.0011, + "step": 4805 + }, + { + "epoch": 1.2684068288181398, + "grad_norm": 0.024034051224589348, + "learning_rate": 7.320226942868452e-05, + "loss": 0.0027, + "step": 4810 + }, + { + "epoch": 1.2697251334783468, + "grad_norm": 0.9501499533653259, + "learning_rate": 7.307032590051459e-05, + "loss": 0.0279, + "step": 4815 + }, + { + "epoch": 1.2710434381385538, + "grad_norm": 0.008805549703538418, + "learning_rate": 7.293838237234464e-05, + "loss": 0.0403, + "step": 4820 + }, + { + "epoch": 1.2723617427987608, + "grad_norm": 0.01750873774290085, + "learning_rate": 7.28064388441747e-05, + "loss": 0.0571, + "step": 4825 + }, + { + "epoch": 1.2736800474589678, + "grad_norm": 0.004490260500460863, + "learning_rate": 7.267449531600475e-05, + "loss": 0.0269, + "step": 4830 + }, + { + "epoch": 1.2749983521191748, + "grad_norm": 0.07510064542293549, + "learning_rate": 7.254255178783481e-05, + "loss": 0.0123, + "step": 4835 + }, + { + "epoch": 1.2763166567793818, + "grad_norm": 0.039783038198947906, + "learning_rate": 7.241060825966486e-05, + "loss": 0.0137, + "step": 4840 + }, + { + "epoch": 1.2776349614395888, + "grad_norm": 0.019004900008440018, + "learning_rate": 7.227866473149493e-05, + "loss": 0.0047, + "step": 4845 + }, + { + "epoch": 1.2789532660997955, + "grad_norm": 0.04813052713871002, + 
"learning_rate": 7.214672120332499e-05, + "loss": 0.0021, + "step": 4850 + }, + { + "epoch": 1.2802715707600028, + "grad_norm": 0.00835048221051693, + "learning_rate": 7.201477767515503e-05, + "loss": 0.0014, + "step": 4855 + }, + { + "epoch": 1.2815898754202095, + "grad_norm": 0.008609198965132236, + "learning_rate": 7.188283414698509e-05, + "loss": 0.0219, + "step": 4860 + }, + { + "epoch": 1.2829081800804165, + "grad_norm": 0.007337458431720734, + "learning_rate": 7.175089061881514e-05, + "loss": 0.0014, + "step": 4865 + }, + { + "epoch": 1.2842264847406235, + "grad_norm": 0.0032645913306623697, + "learning_rate": 7.161894709064521e-05, + "loss": 0.0026, + "step": 4870 + }, + { + "epoch": 1.2855447894008305, + "grad_norm": 0.27384671568870544, + "learning_rate": 7.148700356247527e-05, + "loss": 0.0227, + "step": 4875 + }, + { + "epoch": 1.2868630940610375, + "grad_norm": 0.03584875538945198, + "learning_rate": 7.135506003430532e-05, + "loss": 0.0299, + "step": 4880 + }, + { + "epoch": 1.2881813987212445, + "grad_norm": 0.03482440486550331, + "learning_rate": 7.122311650613538e-05, + "loss": 0.0125, + "step": 4885 + }, + { + "epoch": 1.2894997033814515, + "grad_norm": 0.005974395200610161, + "learning_rate": 7.109117297796543e-05, + "loss": 0.0029, + "step": 4890 + }, + { + "epoch": 1.2908180080416585, + "grad_norm": 0.01820153370499611, + "learning_rate": 7.095922944979549e-05, + "loss": 0.0254, + "step": 4895 + }, + { + "epoch": 1.2921363127018655, + "grad_norm": 0.1733965277671814, + "learning_rate": 7.082728592162555e-05, + "loss": 0.028, + "step": 4900 + }, + { + "epoch": 1.2934546173620725, + "grad_norm": 1.3017303943634033, + "learning_rate": 7.06953423934556e-05, + "loss": 0.0213, + "step": 4905 + }, + { + "epoch": 1.2947729220222794, + "grad_norm": 0.01360877975821495, + "learning_rate": 7.056339886528566e-05, + "loss": 0.0039, + "step": 4910 + }, + { + "epoch": 1.2960912266824862, + "grad_norm": 0.01503999624401331, + "learning_rate": 7.043145533711571e-05, + "loss": 0.0102, + "step": 4915 + }, + { + "epoch": 1.2974095313426934, + "grad_norm": 0.2200804352760315, + "learning_rate": 7.029951180894577e-05, + "loss": 0.0461, + "step": 4920 + }, + { + "epoch": 1.2987278360029002, + "grad_norm": 0.08512946963310242, + "learning_rate": 7.016756828077582e-05, + "loss": 0.0066, + "step": 4925 + }, + { + "epoch": 1.3000461406631072, + "grad_norm": 0.08296570926904678, + "learning_rate": 7.00356247526059e-05, + "loss": 0.0223, + "step": 4930 + }, + { + "epoch": 1.3013644453233142, + "grad_norm": 0.008866079151630402, + "learning_rate": 6.990368122443595e-05, + "loss": 0.0032, + "step": 4935 + }, + { + "epoch": 1.3026827499835212, + "grad_norm": 0.024493014439940453, + "learning_rate": 6.9771737696266e-05, + "loss": 0.0128, + "step": 4940 + }, + { + "epoch": 1.3040010546437282, + "grad_norm": 0.08965341746807098, + "learning_rate": 6.963979416809606e-05, + "loss": 0.028, + "step": 4945 + }, + { + "epoch": 1.3053193593039352, + "grad_norm": 0.023156631737947464, + "learning_rate": 6.950785063992612e-05, + "loss": 0.0187, + "step": 4950 + }, + { + "epoch": 1.3066376639641422, + "grad_norm": 0.18552155792713165, + "learning_rate": 6.937590711175617e-05, + "loss": 0.0424, + "step": 4955 + }, + { + "epoch": 1.3079559686243492, + "grad_norm": 0.02200198918581009, + "learning_rate": 6.924396358358623e-05, + "loss": 0.0148, + "step": 4960 + }, + { + "epoch": 1.3092742732845561, + "grad_norm": 0.00568364467471838, + "learning_rate": 6.911202005541628e-05, + "loss": 0.0199, + "step": 4965 + }, + { 
+ "epoch": 1.310592577944763, + "grad_norm": 0.021591177210211754, + "learning_rate": 6.898007652724634e-05, + "loss": 0.0092, + "step": 4970 + }, + { + "epoch": 1.3119108826049701, + "grad_norm": 0.327177494764328, + "learning_rate": 6.88481329990764e-05, + "loss": 0.0047, + "step": 4975 + }, + { + "epoch": 1.313229187265177, + "grad_norm": 0.024512887001037598, + "learning_rate": 6.871618947090645e-05, + "loss": 0.0046, + "step": 4980 + }, + { + "epoch": 1.314547491925384, + "grad_norm": 0.05725006014108658, + "learning_rate": 6.858424594273652e-05, + "loss": 0.0227, + "step": 4985 + }, + { + "epoch": 1.3158657965855909, + "grad_norm": 0.011280277743935585, + "learning_rate": 6.845230241456658e-05, + "loss": 0.0056, + "step": 4990 + }, + { + "epoch": 1.3171841012457979, + "grad_norm": 0.022504402324557304, + "learning_rate": 6.832035888639663e-05, + "loss": 0.0029, + "step": 4995 + }, + { + "epoch": 1.3185024059060049, + "grad_norm": 0.02168826013803482, + "learning_rate": 6.818841535822669e-05, + "loss": 0.0198, + "step": 5000 + }, + { + "epoch": 1.3185024059060049, + "eval_loss": 0.025039294734597206, + "eval_runtime": 452.1097, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 5000 + }, + { + "epoch": 1.3198207105662119, + "grad_norm": 0.0064329709857702255, + "learning_rate": 6.805647183005673e-05, + "loss": 0.0299, + "step": 5005 + }, + { + "epoch": 1.3211390152264189, + "grad_norm": 0.00267885928042233, + "learning_rate": 6.79245283018868e-05, + "loss": 0.0065, + "step": 5010 + }, + { + "epoch": 1.3224573198866258, + "grad_norm": 0.6842889189720154, + "learning_rate": 6.779258477371685e-05, + "loss": 0.008, + "step": 5015 + }, + { + "epoch": 1.3237756245468328, + "grad_norm": 0.002985635306686163, + "learning_rate": 6.766064124554691e-05, + "loss": 0.0119, + "step": 5020 + }, + { + "epoch": 1.3250939292070396, + "grad_norm": 0.019304940477013588, + "learning_rate": 6.752869771737696e-05, + "loss": 0.0041, + "step": 5025 + }, + { + "epoch": 1.3264122338672468, + "grad_norm": 0.011305035091936588, + "learning_rate": 6.739675418920702e-05, + "loss": 0.0031, + "step": 5030 + }, + { + "epoch": 1.3277305385274536, + "grad_norm": 0.006184784695506096, + "learning_rate": 6.726481066103708e-05, + "loss": 0.0081, + "step": 5035 + }, + { + "epoch": 1.3290488431876606, + "grad_norm": 0.0073184361681342125, + "learning_rate": 6.713286713286715e-05, + "loss": 0.0202, + "step": 5040 + }, + { + "epoch": 1.3303671478478676, + "grad_norm": 0.006566181313246489, + "learning_rate": 6.70009236046972e-05, + "loss": 0.0052, + "step": 5045 + }, + { + "epoch": 1.3316854525080746, + "grad_norm": 0.31427526473999023, + "learning_rate": 6.686898007652726e-05, + "loss": 0.017, + "step": 5050 + }, + { + "epoch": 1.3330037571682816, + "grad_norm": 0.005085447803139687, + "learning_rate": 6.67370365483573e-05, + "loss": 0.009, + "step": 5055 + }, + { + "epoch": 1.3343220618284886, + "grad_norm": 0.2745366096496582, + "learning_rate": 6.660509302018735e-05, + "loss": 0.0119, + "step": 5060 + }, + { + "epoch": 1.3356403664886956, + "grad_norm": 0.2871796786785126, + "learning_rate": 6.647314949201742e-05, + "loss": 0.0158, + "step": 5065 + }, + { + "epoch": 1.3369586711489025, + "grad_norm": 0.2774186134338379, + "learning_rate": 6.634120596384748e-05, + "loss": 0.0084, + "step": 5070 + }, + { + "epoch": 1.3382769758091095, + "grad_norm": 0.013278775848448277, + "learning_rate": 6.620926243567753e-05, + "loss": 0.0111, + "step": 5075 + }, + { + "epoch": 1.3395952804693165, + 
"grad_norm": 0.01614517532289028, + "learning_rate": 6.607731890750759e-05, + "loss": 0.0066, + "step": 5080 + }, + { + "epoch": 1.3409135851295235, + "grad_norm": 0.0037789656780660152, + "learning_rate": 6.594537537933765e-05, + "loss": 0.0142, + "step": 5085 + }, + { + "epoch": 1.3422318897897303, + "grad_norm": 0.03221861273050308, + "learning_rate": 6.58134318511677e-05, + "loss": 0.0155, + "step": 5090 + }, + { + "epoch": 1.3435501944499375, + "grad_norm": 0.005637989845126867, + "learning_rate": 6.568148832299776e-05, + "loss": 0.0022, + "step": 5095 + }, + { + "epoch": 1.3448684991101443, + "grad_norm": 0.0017844432732090354, + "learning_rate": 6.554954479482783e-05, + "loss": 0.0217, + "step": 5100 + }, + { + "epoch": 1.3461868037703513, + "grad_norm": 0.08099021762609482, + "learning_rate": 6.541760126665787e-05, + "loss": 0.0222, + "step": 5105 + }, + { + "epoch": 1.3475051084305583, + "grad_norm": 0.011909045279026031, + "learning_rate": 6.528565773848792e-05, + "loss": 0.0058, + "step": 5110 + }, + { + "epoch": 1.3488234130907653, + "grad_norm": 0.7332578301429749, + "learning_rate": 6.515371421031798e-05, + "loss": 0.0286, + "step": 5115 + }, + { + "epoch": 1.3501417177509722, + "grad_norm": 0.3415885865688324, + "learning_rate": 6.502177068214804e-05, + "loss": 0.1191, + "step": 5120 + }, + { + "epoch": 1.3514600224111792, + "grad_norm": 0.00904211588203907, + "learning_rate": 6.48898271539781e-05, + "loss": 0.0043, + "step": 5125 + }, + { + "epoch": 1.3527783270713862, + "grad_norm": 0.1978830248117447, + "learning_rate": 6.475788362580816e-05, + "loss": 0.0316, + "step": 5130 + }, + { + "epoch": 1.3540966317315932, + "grad_norm": 0.10229042172431946, + "learning_rate": 6.462594009763822e-05, + "loss": 0.0194, + "step": 5135 + }, + { + "epoch": 1.3554149363918002, + "grad_norm": 0.4457210600376129, + "learning_rate": 6.449399656946827e-05, + "loss": 0.0276, + "step": 5140 + }, + { + "epoch": 1.356733241052007, + "grad_norm": 0.023706572130322456, + "learning_rate": 6.436205304129833e-05, + "loss": 0.0163, + "step": 5145 + }, + { + "epoch": 1.3580515457122142, + "grad_norm": 1.166896939277649, + "learning_rate": 6.423010951312838e-05, + "loss": 0.0189, + "step": 5150 + }, + { + "epoch": 1.359369850372421, + "grad_norm": 0.0016115796752274036, + "learning_rate": 6.409816598495844e-05, + "loss": 0.0191, + "step": 5155 + }, + { + "epoch": 1.360688155032628, + "grad_norm": 0.00786682777106762, + "learning_rate": 6.39662224567885e-05, + "loss": 0.0119, + "step": 5160 + }, + { + "epoch": 1.362006459692835, + "grad_norm": 1.042732834815979, + "learning_rate": 6.383427892861855e-05, + "loss": 0.0497, + "step": 5165 + }, + { + "epoch": 1.363324764353042, + "grad_norm": 0.007983304560184479, + "learning_rate": 6.37023354004486e-05, + "loss": 0.044, + "step": 5170 + }, + { + "epoch": 1.364643069013249, + "grad_norm": 0.009767642244696617, + "learning_rate": 6.357039187227866e-05, + "loss": 0.0405, + "step": 5175 + }, + { + "epoch": 1.365961373673456, + "grad_norm": 0.03164628520607948, + "learning_rate": 6.343844834410873e-05, + "loss": 0.0138, + "step": 5180 + }, + { + "epoch": 1.367279678333663, + "grad_norm": 0.004159921780228615, + "learning_rate": 6.330650481593879e-05, + "loss": 0.0045, + "step": 5185 + }, + { + "epoch": 1.36859798299387, + "grad_norm": 0.004395391326397657, + "learning_rate": 6.317456128776884e-05, + "loss": 0.0046, + "step": 5190 + }, + { + "epoch": 1.369916287654077, + "grad_norm": 0.011886746622622013, + "learning_rate": 6.30426177595989e-05, + "loss": 
0.0064, + "step": 5195 + }, + { + "epoch": 1.371234592314284, + "grad_norm": 0.2259266972541809, + "learning_rate": 6.291067423142895e-05, + "loss": 0.0076, + "step": 5200 + }, + { + "epoch": 1.372552896974491, + "grad_norm": 0.01407301053404808, + "learning_rate": 6.277873070325901e-05, + "loss": 0.0201, + "step": 5205 + }, + { + "epoch": 1.3738712016346977, + "grad_norm": 0.00911578256636858, + "learning_rate": 6.264678717508906e-05, + "loss": 0.0164, + "step": 5210 + }, + { + "epoch": 1.3751895062949049, + "grad_norm": 0.20968014001846313, + "learning_rate": 6.251484364691912e-05, + "loss": 0.0075, + "step": 5215 + }, + { + "epoch": 1.3765078109551117, + "grad_norm": 0.008801166899502277, + "learning_rate": 6.238290011874918e-05, + "loss": 0.0068, + "step": 5220 + }, + { + "epoch": 1.3778261156153186, + "grad_norm": 0.007181806955486536, + "learning_rate": 6.225095659057923e-05, + "loss": 0.0136, + "step": 5225 + }, + { + "epoch": 1.3791444202755256, + "grad_norm": 0.7527109980583191, + "learning_rate": 6.211901306240929e-05, + "loss": 0.0287, + "step": 5230 + }, + { + "epoch": 1.3804627249357326, + "grad_norm": 0.039015207439661026, + "learning_rate": 6.198706953423936e-05, + "loss": 0.0326, + "step": 5235 + }, + { + "epoch": 1.3817810295959396, + "grad_norm": 0.021076606586575508, + "learning_rate": 6.185512600606941e-05, + "loss": 0.0191, + "step": 5240 + }, + { + "epoch": 1.3830993342561466, + "grad_norm": 0.016630731523036957, + "learning_rate": 6.172318247789947e-05, + "loss": 0.0131, + "step": 5245 + }, + { + "epoch": 1.3844176389163536, + "grad_norm": 0.011133644729852676, + "learning_rate": 6.159123894972952e-05, + "loss": 0.0029, + "step": 5250 + }, + { + "epoch": 1.3857359435765606, + "grad_norm": 0.6434677243232727, + "learning_rate": 6.145929542155957e-05, + "loss": 0.0091, + "step": 5255 + }, + { + "epoch": 1.3870542482367676, + "grad_norm": 0.051020298153162, + "learning_rate": 6.132735189338964e-05, + "loss": 0.0086, + "step": 5260 + }, + { + "epoch": 1.3883725528969744, + "grad_norm": 0.016413932666182518, + "learning_rate": 6.119540836521969e-05, + "loss": 0.0061, + "step": 5265 + }, + { + "epoch": 1.3896908575571816, + "grad_norm": 0.005769540090113878, + "learning_rate": 6.106346483704975e-05, + "loss": 0.0027, + "step": 5270 + }, + { + "epoch": 1.3910091622173884, + "grad_norm": 0.06687796860933304, + "learning_rate": 6.09315213088798e-05, + "loss": 0.0423, + "step": 5275 + }, + { + "epoch": 1.3923274668775953, + "grad_norm": 0.005641553085297346, + "learning_rate": 6.079957778070986e-05, + "loss": 0.0353, + "step": 5280 + }, + { + "epoch": 1.3936457715378023, + "grad_norm": 0.04460568353533745, + "learning_rate": 6.066763425253992e-05, + "loss": 0.0041, + "step": 5285 + }, + { + "epoch": 1.3949640761980093, + "grad_norm": 0.0387534461915493, + "learning_rate": 6.0535690724369976e-05, + "loss": 0.006, + "step": 5290 + }, + { + "epoch": 1.3962823808582163, + "grad_norm": 0.010292598977684975, + "learning_rate": 6.040374719620003e-05, + "loss": 0.0038, + "step": 5295 + }, + { + "epoch": 1.3976006855184233, + "grad_norm": 0.3646155297756195, + "learning_rate": 6.0271803668030094e-05, + "loss": 0.0111, + "step": 5300 + }, + { + "epoch": 1.3989189901786303, + "grad_norm": 0.022035539150238037, + "learning_rate": 6.0139860139860136e-05, + "loss": 0.0507, + "step": 5305 + }, + { + "epoch": 1.4002372948388373, + "grad_norm": 0.003314939560368657, + "learning_rate": 6.00079166116902e-05, + "loss": 0.0132, + "step": 5310 + }, + { + "epoch": 1.4015555994990443, + "grad_norm": 
0.0838267058134079, + "learning_rate": 5.9875973083520254e-05, + "loss": 0.0105, + "step": 5315 + }, + { + "epoch": 1.4028739041592513, + "grad_norm": 0.009368584491312504, + "learning_rate": 5.974402955535031e-05, + "loss": 0.0026, + "step": 5320 + }, + { + "epoch": 1.4041922088194583, + "grad_norm": 0.031248098239302635, + "learning_rate": 5.961208602718037e-05, + "loss": 0.0151, + "step": 5325 + }, + { + "epoch": 1.405510513479665, + "grad_norm": 0.06447605788707733, + "learning_rate": 5.948014249901043e-05, + "loss": 0.0219, + "step": 5330 + }, + { + "epoch": 1.4068288181398723, + "grad_norm": 0.010814374312758446, + "learning_rate": 5.9348198970840484e-05, + "loss": 0.0038, + "step": 5335 + }, + { + "epoch": 1.408147122800079, + "grad_norm": 0.6235967874526978, + "learning_rate": 5.9216255442670546e-05, + "loss": 0.0354, + "step": 5340 + }, + { + "epoch": 1.409465427460286, + "grad_norm": 0.026741521432995796, + "learning_rate": 5.90843119145006e-05, + "loss": 0.0032, + "step": 5345 + }, + { + "epoch": 1.410783732120493, + "grad_norm": 0.019413433969020844, + "learning_rate": 5.895236838633066e-05, + "loss": 0.0216, + "step": 5350 + }, + { + "epoch": 1.4121020367807, + "grad_norm": 0.0735543966293335, + "learning_rate": 5.8820424858160706e-05, + "loss": 0.0033, + "step": 5355 + }, + { + "epoch": 1.413420341440907, + "grad_norm": 0.005189546383917332, + "learning_rate": 5.868848132999076e-05, + "loss": 0.021, + "step": 5360 + }, + { + "epoch": 1.414738646101114, + "grad_norm": 0.21240335702896118, + "learning_rate": 5.8556537801820824e-05, + "loss": 0.0294, + "step": 5365 + }, + { + "epoch": 1.416056950761321, + "grad_norm": 0.010165920481085777, + "learning_rate": 5.842459427365088e-05, + "loss": 0.0021, + "step": 5370 + }, + { + "epoch": 1.417375255421528, + "grad_norm": 0.026774069294333458, + "learning_rate": 5.8292650745480936e-05, + "loss": 0.0299, + "step": 5375 + }, + { + "epoch": 1.418693560081735, + "grad_norm": 0.0019810455851256847, + "learning_rate": 5.816070721731099e-05, + "loss": 0.0029, + "step": 5380 + }, + { + "epoch": 1.4200118647419417, + "grad_norm": 0.038888879120349884, + "learning_rate": 5.8028763689141054e-05, + "loss": 0.0069, + "step": 5385 + }, + { + "epoch": 1.421330169402149, + "grad_norm": 0.016180936247110367, + "learning_rate": 5.789682016097111e-05, + "loss": 0.0032, + "step": 5390 + }, + { + "epoch": 1.4226484740623557, + "grad_norm": 0.01119404286146164, + "learning_rate": 5.7764876632801165e-05, + "loss": 0.0024, + "step": 5395 + }, + { + "epoch": 1.4239667787225627, + "grad_norm": 0.010486694052815437, + "learning_rate": 5.763293310463123e-05, + "loss": 0.0324, + "step": 5400 + }, + { + "epoch": 1.4252850833827697, + "grad_norm": 0.005453066434711218, + "learning_rate": 5.750098957646127e-05, + "loss": 0.0038, + "step": 5405 + }, + { + "epoch": 1.4266033880429767, + "grad_norm": 0.17556461691856384, + "learning_rate": 5.736904604829133e-05, + "loss": 0.0305, + "step": 5410 + }, + { + "epoch": 1.4279216927031837, + "grad_norm": 0.03074715845286846, + "learning_rate": 5.723710252012139e-05, + "loss": 0.003, + "step": 5415 + }, + { + "epoch": 1.4292399973633907, + "grad_norm": 1.7238941192626953, + "learning_rate": 5.710515899195144e-05, + "loss": 0.0254, + "step": 5420 + }, + { + "epoch": 1.4305583020235977, + "grad_norm": 0.012462320737540722, + "learning_rate": 5.6973215463781506e-05, + "loss": 0.0018, + "step": 5425 + }, + { + "epoch": 1.4318766066838047, + "grad_norm": 0.021576853469014168, + "learning_rate": 5.684127193561156e-05, + "loss": 
0.0472, + "step": 5430 + }, + { + "epoch": 1.4331949113440117, + "grad_norm": 0.2862134575843811, + "learning_rate": 5.670932840744162e-05, + "loss": 0.0258, + "step": 5435 + }, + { + "epoch": 1.4345132160042184, + "grad_norm": 0.28419312834739685, + "learning_rate": 5.657738487927168e-05, + "loss": 0.0053, + "step": 5440 + }, + { + "epoch": 1.4358315206644257, + "grad_norm": 0.013650139793753624, + "learning_rate": 5.6445441351101735e-05, + "loss": 0.0126, + "step": 5445 + }, + { + "epoch": 1.4371498253246324, + "grad_norm": 0.01203097216784954, + "learning_rate": 5.631349782293179e-05, + "loss": 0.0076, + "step": 5450 + }, + { + "epoch": 1.4384681299848394, + "grad_norm": 0.0881054624915123, + "learning_rate": 5.618155429476184e-05, + "loss": 0.0178, + "step": 5455 + }, + { + "epoch": 1.4397864346450464, + "grad_norm": 0.5258516669273376, + "learning_rate": 5.6049610766591895e-05, + "loss": 0.0112, + "step": 5460 + }, + { + "epoch": 1.4411047393052534, + "grad_norm": 0.001202153041958809, + "learning_rate": 5.591766723842196e-05, + "loss": 0.0089, + "step": 5465 + }, + { + "epoch": 1.4424230439654604, + "grad_norm": 0.4498993456363678, + "learning_rate": 5.5785723710252014e-05, + "loss": 0.0252, + "step": 5470 + }, + { + "epoch": 1.4437413486256674, + "grad_norm": 0.17477644979953766, + "learning_rate": 5.565378018208207e-05, + "loss": 0.0169, + "step": 5475 + }, + { + "epoch": 1.4450596532858744, + "grad_norm": 0.019443338736891747, + "learning_rate": 5.552183665391213e-05, + "loss": 0.0019, + "step": 5480 + }, + { + "epoch": 1.4463779579460814, + "grad_norm": 0.005653039086610079, + "learning_rate": 5.538989312574219e-05, + "loss": 0.0231, + "step": 5485 + }, + { + "epoch": 1.4476962626062884, + "grad_norm": 0.01554112322628498, + "learning_rate": 5.525794959757224e-05, + "loss": 0.0167, + "step": 5490 + }, + { + "epoch": 1.4490145672664954, + "grad_norm": 0.044272180646657944, + "learning_rate": 5.5126006069402305e-05, + "loss": 0.007, + "step": 5495 + }, + { + "epoch": 1.4503328719267023, + "grad_norm": 0.014857172966003418, + "learning_rate": 5.499406254123236e-05, + "loss": 0.0045, + "step": 5500 + }, + { + "epoch": 1.4503328719267023, + "eval_loss": 0.02392147295176983, + "eval_runtime": 452.468, + "eval_samples_per_second": 7.452, + "eval_steps_per_second": 3.726, + "step": 5500 + }, + { + "epoch": 1.4516511765869091, + "grad_norm": 0.007390835788100958, + "learning_rate": 5.486211901306241e-05, + "loss": 0.0171, + "step": 5505 + }, + { + "epoch": 1.4529694812471163, + "grad_norm": 0.0050474610179662704, + "learning_rate": 5.4730175484892466e-05, + "loss": 0.004, + "step": 5510 + }, + { + "epoch": 1.454287785907323, + "grad_norm": 0.08066163957118988, + "learning_rate": 5.459823195672252e-05, + "loss": 0.0103, + "step": 5515 + }, + { + "epoch": 1.45560609056753, + "grad_norm": 0.0062376330606639385, + "learning_rate": 5.4466288428552584e-05, + "loss": 0.0066, + "step": 5520 + }, + { + "epoch": 1.456924395227737, + "grad_norm": 0.00711809890344739, + "learning_rate": 5.433434490038264e-05, + "loss": 0.003, + "step": 5525 + }, + { + "epoch": 1.458242699887944, + "grad_norm": 0.004010149277746677, + "learning_rate": 5.4202401372212695e-05, + "loss": 0.0231, + "step": 5530 + }, + { + "epoch": 1.459561004548151, + "grad_norm": 0.4791967272758484, + "learning_rate": 5.407045784404276e-05, + "loss": 0.0277, + "step": 5535 + }, + { + "epoch": 1.460879309208358, + "grad_norm": 0.03979189693927765, + "learning_rate": 5.393851431587281e-05, + "loss": 0.0033, + "step": 5540 + }, + { + 
"epoch": 1.462197613868565, + "grad_norm": 0.03331119939684868, + "learning_rate": 5.380657078770287e-05, + "loss": 0.0187, + "step": 5545 + }, + { + "epoch": 1.463515918528772, + "grad_norm": 0.0042802803218364716, + "learning_rate": 5.367462725953293e-05, + "loss": 0.0032, + "step": 5550 + }, + { + "epoch": 1.464834223188979, + "grad_norm": 0.05439918115735054, + "learning_rate": 5.354268373136297e-05, + "loss": 0.0043, + "step": 5555 + }, + { + "epoch": 1.4661525278491858, + "grad_norm": 0.042643506079912186, + "learning_rate": 5.3410740203193036e-05, + "loss": 0.0059, + "step": 5560 + }, + { + "epoch": 1.467470832509393, + "grad_norm": 0.023453116416931152, + "learning_rate": 5.327879667502309e-05, + "loss": 0.0043, + "step": 5565 + }, + { + "epoch": 1.4687891371695998, + "grad_norm": 0.037712760269641876, + "learning_rate": 5.314685314685315e-05, + "loss": 0.0033, + "step": 5570 + }, + { + "epoch": 1.4701074418298068, + "grad_norm": 1.0485608577728271, + "learning_rate": 5.301490961868321e-05, + "loss": 0.0489, + "step": 5575 + }, + { + "epoch": 1.4714257464900138, + "grad_norm": 0.004728829488158226, + "learning_rate": 5.2882966090513265e-05, + "loss": 0.0067, + "step": 5580 + }, + { + "epoch": 1.4727440511502208, + "grad_norm": 0.027893677353858948, + "learning_rate": 5.275102256234332e-05, + "loss": 0.0208, + "step": 5585 + }, + { + "epoch": 1.4740623558104278, + "grad_norm": 0.02256879396736622, + "learning_rate": 5.2619079034173377e-05, + "loss": 0.0036, + "step": 5590 + }, + { + "epoch": 1.4753806604706348, + "grad_norm": 0.12636558711528778, + "learning_rate": 5.248713550600344e-05, + "loss": 0.0046, + "step": 5595 + }, + { + "epoch": 1.4766989651308418, + "grad_norm": 0.000997041119262576, + "learning_rate": 5.235519197783348e-05, + "loss": 0.0101, + "step": 5600 + }, + { + "epoch": 1.4780172697910487, + "grad_norm": 0.023494020104408264, + "learning_rate": 5.2223248449663543e-05, + "loss": 0.0039, + "step": 5605 + }, + { + "epoch": 1.4793355744512557, + "grad_norm": 0.01525307446718216, + "learning_rate": 5.20913049214936e-05, + "loss": 0.021, + "step": 5610 + }, + { + "epoch": 1.4806538791114627, + "grad_norm": 0.0024215306621044874, + "learning_rate": 5.1959361393323655e-05, + "loss": 0.0017, + "step": 5615 + }, + { + "epoch": 1.4819721837716697, + "grad_norm": 1.4708061218261719, + "learning_rate": 5.182741786515372e-05, + "loss": 0.04, + "step": 5620 + }, + { + "epoch": 1.4832904884318765, + "grad_norm": 0.015033531002700329, + "learning_rate": 5.169547433698377e-05, + "loss": 0.0042, + "step": 5625 + }, + { + "epoch": 1.4846087930920837, + "grad_norm": 0.0035444959066808224, + "learning_rate": 5.156353080881383e-05, + "loss": 0.0087, + "step": 5630 + }, + { + "epoch": 1.4859270977522905, + "grad_norm": 0.010087919421494007, + "learning_rate": 5.143158728064389e-05, + "loss": 0.0158, + "step": 5635 + }, + { + "epoch": 1.4872454024124975, + "grad_norm": 0.05779251083731651, + "learning_rate": 5.129964375247395e-05, + "loss": 0.0157, + "step": 5640 + }, + { + "epoch": 1.4885637070727045, + "grad_norm": 0.14927980303764343, + "learning_rate": 5.1167700224304e-05, + "loss": 0.0257, + "step": 5645 + }, + { + "epoch": 1.4898820117329115, + "grad_norm": 0.004252352751791477, + "learning_rate": 5.103575669613405e-05, + "loss": 0.0198, + "step": 5650 + }, + { + "epoch": 1.4912003163931185, + "grad_norm": 0.0029206848703324795, + "learning_rate": 5.090381316796411e-05, + "loss": 0.0016, + "step": 5655 + }, + { + "epoch": 1.4925186210533254, + "grad_norm": 0.005047530401498079, + 
"learning_rate": 5.077186963979417e-05, + "loss": 0.0023, + "step": 5660 + }, + { + "epoch": 1.4938369257135324, + "grad_norm": 0.003732564626261592, + "learning_rate": 5.0639926111624225e-05, + "loss": 0.0336, + "step": 5665 + }, + { + "epoch": 1.4951552303737394, + "grad_norm": 0.3832889497280121, + "learning_rate": 5.050798258345428e-05, + "loss": 0.0476, + "step": 5670 + }, + { + "epoch": 1.4964735350339464, + "grad_norm": 0.06733009219169617, + "learning_rate": 5.037603905528434e-05, + "loss": 0.0044, + "step": 5675 + }, + { + "epoch": 1.4977918396941532, + "grad_norm": 0.008067069575190544, + "learning_rate": 5.02440955271144e-05, + "loss": 0.0035, + "step": 5680 + }, + { + "epoch": 1.4991101443543604, + "grad_norm": 0.01706300489604473, + "learning_rate": 5.0112151998944454e-05, + "loss": 0.0031, + "step": 5685 + }, + { + "epoch": 1.5004284490145672, + "grad_norm": 0.009932024404406548, + "learning_rate": 4.998020847077451e-05, + "loss": 0.0587, + "step": 5690 + }, + { + "epoch": 1.5017467536747744, + "grad_norm": 0.006488936021924019, + "learning_rate": 4.9848264942604566e-05, + "loss": 0.002, + "step": 5695 + }, + { + "epoch": 1.5030650583349812, + "grad_norm": 0.17488756775856018, + "learning_rate": 4.971632141443462e-05, + "loss": 0.0245, + "step": 5700 + }, + { + "epoch": 1.5043833629951882, + "grad_norm": 0.3327178359031677, + "learning_rate": 4.9584377886264684e-05, + "loss": 0.0404, + "step": 5705 + }, + { + "epoch": 1.5057016676553951, + "grad_norm": 0.18467263877391815, + "learning_rate": 4.945243435809474e-05, + "loss": 0.0248, + "step": 5710 + }, + { + "epoch": 1.5070199723156021, + "grad_norm": 0.020061776041984558, + "learning_rate": 4.9320490829924795e-05, + "loss": 0.0034, + "step": 5715 + }, + { + "epoch": 1.5083382769758091, + "grad_norm": 0.0005288647953420877, + "learning_rate": 4.918854730175485e-05, + "loss": 0.0076, + "step": 5720 + }, + { + "epoch": 1.5096565816360161, + "grad_norm": 0.007515576668083668, + "learning_rate": 4.9056603773584906e-05, + "loss": 0.004, + "step": 5725 + }, + { + "epoch": 1.5109748862962231, + "grad_norm": 0.05365758761763573, + "learning_rate": 4.892466024541497e-05, + "loss": 0.0222, + "step": 5730 + }, + { + "epoch": 1.51229319095643, + "grad_norm": 0.00572391040623188, + "learning_rate": 4.8792716717245025e-05, + "loss": 0.0132, + "step": 5735 + }, + { + "epoch": 1.513611495616637, + "grad_norm": 0.21178627014160156, + "learning_rate": 4.8660773189075073e-05, + "loss": 0.0417, + "step": 5740 + }, + { + "epoch": 1.5149298002768439, + "grad_norm": 0.0641486868262291, + "learning_rate": 4.8528829660905136e-05, + "loss": 0.011, + "step": 5745 + }, + { + "epoch": 1.516248104937051, + "grad_norm": 0.04451924189925194, + "learning_rate": 4.839688613273519e-05, + "loss": 0.012, + "step": 5750 + }, + { + "epoch": 1.5175664095972579, + "grad_norm": 0.019951259717345238, + "learning_rate": 4.826494260456525e-05, + "loss": 0.009, + "step": 5755 + }, + { + "epoch": 1.5188847142574649, + "grad_norm": 0.021919893100857735, + "learning_rate": 4.813299907639531e-05, + "loss": 0.0081, + "step": 5760 + }, + { + "epoch": 1.5202030189176718, + "grad_norm": 0.5730367302894592, + "learning_rate": 4.800105554822536e-05, + "loss": 0.0254, + "step": 5765 + }, + { + "epoch": 1.5215213235778788, + "grad_norm": 0.02501523122191429, + "learning_rate": 4.786911202005542e-05, + "loss": 0.0045, + "step": 5770 + }, + { + "epoch": 1.5228396282380858, + "grad_norm": 0.01574208028614521, + "learning_rate": 4.773716849188548e-05, + "loss": 0.0081, + "step": 5775 + 
}, + { + "epoch": 1.5241579328982928, + "grad_norm": 0.009626791812479496, + "learning_rate": 4.760522496371553e-05, + "loss": 0.0037, + "step": 5780 + }, + { + "epoch": 1.5254762375584998, + "grad_norm": 0.535539448261261, + "learning_rate": 4.747328143554559e-05, + "loss": 0.0149, + "step": 5785 + }, + { + "epoch": 1.5267945422187066, + "grad_norm": 0.004934845492243767, + "learning_rate": 4.7341337907375644e-05, + "loss": 0.0048, + "step": 5790 + }, + { + "epoch": 1.5281128468789138, + "grad_norm": 0.009070080704987049, + "learning_rate": 4.72093943792057e-05, + "loss": 0.0028, + "step": 5795 + }, + { + "epoch": 1.5294311515391206, + "grad_norm": 0.0040720063261687756, + "learning_rate": 4.707745085103576e-05, + "loss": 0.0016, + "step": 5800 + }, + { + "epoch": 1.5307494561993278, + "grad_norm": 0.45212000608444214, + "learning_rate": 4.694550732286582e-05, + "loss": 0.0111, + "step": 5805 + }, + { + "epoch": 1.5320677608595346, + "grad_norm": 0.024048497900366783, + "learning_rate": 4.681356379469587e-05, + "loss": 0.0149, + "step": 5810 + }, + { + "epoch": 1.5333860655197418, + "grad_norm": 0.11899136006832123, + "learning_rate": 4.668162026652593e-05, + "loss": 0.0034, + "step": 5815 + }, + { + "epoch": 1.5347043701799485, + "grad_norm": 0.011249657720327377, + "learning_rate": 4.6549676738355984e-05, + "loss": 0.0052, + "step": 5820 + }, + { + "epoch": 1.5360226748401555, + "grad_norm": 0.051634710282087326, + "learning_rate": 4.641773321018604e-05, + "loss": 0.0031, + "step": 5825 + }, + { + "epoch": 1.5373409795003625, + "grad_norm": 0.3726826012134552, + "learning_rate": 4.62857896820161e-05, + "loss": 0.0582, + "step": 5830 + }, + { + "epoch": 1.5386592841605695, + "grad_norm": 0.5827310681343079, + "learning_rate": 4.615384615384616e-05, + "loss": 0.0652, + "step": 5835 + }, + { + "epoch": 1.5399775888207765, + "grad_norm": 0.006390869617462158, + "learning_rate": 4.6021902625676214e-05, + "loss": 0.0022, + "step": 5840 + }, + { + "epoch": 1.5412958934809835, + "grad_norm": 0.022760871797800064, + "learning_rate": 4.588995909750627e-05, + "loss": 0.0311, + "step": 5845 + }, + { + "epoch": 1.5426141981411905, + "grad_norm": 0.22773241996765137, + "learning_rate": 4.5758015569336325e-05, + "loss": 0.0051, + "step": 5850 + }, + { + "epoch": 1.5439325028013973, + "grad_norm": 0.015375247225165367, + "learning_rate": 4.562607204116639e-05, + "loss": 0.0023, + "step": 5855 + }, + { + "epoch": 1.5452508074616045, + "grad_norm": 0.007347101345658302, + "learning_rate": 4.549412851299644e-05, + "loss": 0.0437, + "step": 5860 + }, + { + "epoch": 1.5465691121218113, + "grad_norm": 0.012344900518655777, + "learning_rate": 4.536218498482649e-05, + "loss": 0.004, + "step": 5865 + }, + { + "epoch": 1.5478874167820185, + "grad_norm": 0.27038896083831787, + "learning_rate": 4.5230241456656555e-05, + "loss": 0.0047, + "step": 5870 + }, + { + "epoch": 1.5492057214422252, + "grad_norm": 0.016395213082432747, + "learning_rate": 4.509829792848661e-05, + "loss": 0.0026, + "step": 5875 + }, + { + "epoch": 1.5505240261024322, + "grad_norm": 0.4217267632484436, + "learning_rate": 4.4966354400316666e-05, + "loss": 0.0364, + "step": 5880 + }, + { + "epoch": 1.5518423307626392, + "grad_norm": 0.20046105980873108, + "learning_rate": 4.483441087214673e-05, + "loss": 0.0243, + "step": 5885 + }, + { + "epoch": 1.5531606354228462, + "grad_norm": 0.004307698458433151, + "learning_rate": 4.470246734397678e-05, + "loss": 0.0064, + "step": 5890 + }, + { + "epoch": 1.5544789400830532, + "grad_norm": 
0.46102187037467957, + "learning_rate": 4.457052381580683e-05, + "loss": 0.0115, + "step": 5895 + }, + { + "epoch": 1.5557972447432602, + "grad_norm": 0.0689118504524231, + "learning_rate": 4.4438580287636895e-05, + "loss": 0.0334, + "step": 5900 + }, + { + "epoch": 1.5571155494034672, + "grad_norm": 0.003091114340350032, + "learning_rate": 4.430663675946695e-05, + "loss": 0.0246, + "step": 5905 + }, + { + "epoch": 1.558433854063674, + "grad_norm": 0.003877349430695176, + "learning_rate": 4.417469323129701e-05, + "loss": 0.0032, + "step": 5910 + }, + { + "epoch": 1.5597521587238812, + "grad_norm": 0.30713143944740295, + "learning_rate": 4.404274970312706e-05, + "loss": 0.0229, + "step": 5915 + }, + { + "epoch": 1.561070463384088, + "grad_norm": 0.07344445586204529, + "learning_rate": 4.391080617495712e-05, + "loss": 0.0078, + "step": 5920 + }, + { + "epoch": 1.5623887680442952, + "grad_norm": 0.01774723082780838, + "learning_rate": 4.377886264678718e-05, + "loss": 0.0034, + "step": 5925 + }, + { + "epoch": 1.563707072704502, + "grad_norm": 0.476324200630188, + "learning_rate": 4.3646919118617236e-05, + "loss": 0.0071, + "step": 5930 + }, + { + "epoch": 1.5650253773647091, + "grad_norm": 0.11624465882778168, + "learning_rate": 4.351497559044729e-05, + "loss": 0.0236, + "step": 5935 + }, + { + "epoch": 1.566343682024916, + "grad_norm": 0.190691277384758, + "learning_rate": 4.338303206227735e-05, + "loss": 0.006, + "step": 5940 + }, + { + "epoch": 1.567661986685123, + "grad_norm": 0.20517045259475708, + "learning_rate": 4.32510885341074e-05, + "loss": 0.009, + "step": 5945 + }, + { + "epoch": 1.56898029134533, + "grad_norm": 0.008122317492961884, + "learning_rate": 4.311914500593746e-05, + "loss": 0.0041, + "step": 5950 + }, + { + "epoch": 1.570298596005537, + "grad_norm": 0.01982291042804718, + "learning_rate": 4.298720147776752e-05, + "loss": 0.0258, + "step": 5955 + }, + { + "epoch": 1.5716169006657439, + "grad_norm": 0.000996922142803669, + "learning_rate": 4.285525794959758e-05, + "loss": 0.0233, + "step": 5960 + }, + { + "epoch": 1.5729352053259509, + "grad_norm": 0.09725592285394669, + "learning_rate": 4.272331442142763e-05, + "loss": 0.0218, + "step": 5965 + }, + { + "epoch": 1.5742535099861579, + "grad_norm": 0.0672350749373436, + "learning_rate": 4.259137089325769e-05, + "loss": 0.0194, + "step": 5970 + }, + { + "epoch": 1.5755718146463646, + "grad_norm": 0.014844833873212337, + "learning_rate": 4.2459427365087744e-05, + "loss": 0.0298, + "step": 5975 + }, + { + "epoch": 1.5768901193065719, + "grad_norm": 0.030519040301442146, + "learning_rate": 4.2327483836917806e-05, + "loss": 0.0178, + "step": 5980 + }, + { + "epoch": 1.5782084239667786, + "grad_norm": 0.018561460077762604, + "learning_rate": 4.219554030874786e-05, + "loss": 0.0154, + "step": 5985 + }, + { + "epoch": 1.5795267286269858, + "grad_norm": 0.02470085583627224, + "learning_rate": 4.206359678057791e-05, + "loss": 0.0361, + "step": 5990 + }, + { + "epoch": 1.5808450332871926, + "grad_norm": 0.055412422865629196, + "learning_rate": 4.193165325240797e-05, + "loss": 0.0162, + "step": 5995 + }, + { + "epoch": 1.5821633379473996, + "grad_norm": 0.0034158769994974136, + "learning_rate": 4.179970972423803e-05, + "loss": 0.0068, + "step": 6000 + }, + { + "epoch": 1.5821633379473996, + "eval_loss": 0.024797894060611725, + "eval_runtime": 452.1611, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 6000 + }, + { + "epoch": 1.5834816426076066, + "grad_norm": 0.01284120511263609, + "learning_rate": 
4.1667766196068085e-05, + "loss": 0.0036, + "step": 6005 + }, + { + "epoch": 1.5847999472678136, + "grad_norm": 0.01274865586310625, + "learning_rate": 4.153582266789815e-05, + "loss": 0.0447, + "step": 6010 + }, + { + "epoch": 1.5861182519280206, + "grad_norm": 0.03555435314774513, + "learning_rate": 4.1403879139728196e-05, + "loss": 0.0078, + "step": 6015 + }, + { + "epoch": 1.5874365565882276, + "grad_norm": 0.0011938117677345872, + "learning_rate": 4.127193561155825e-05, + "loss": 0.0136, + "step": 6020 + }, + { + "epoch": 1.5887548612484346, + "grad_norm": 0.9741255640983582, + "learning_rate": 4.1139992083388314e-05, + "loss": 0.0153, + "step": 6025 + }, + { + "epoch": 1.5900731659086413, + "grad_norm": 0.011220674030482769, + "learning_rate": 4.100804855521837e-05, + "loss": 0.0262, + "step": 6030 + }, + { + "epoch": 1.5913914705688486, + "grad_norm": 0.021556466817855835, + "learning_rate": 4.0876105027048425e-05, + "loss": 0.0044, + "step": 6035 + }, + { + "epoch": 1.5927097752290553, + "grad_norm": 0.2725502848625183, + "learning_rate": 4.074416149887848e-05, + "loss": 0.0558, + "step": 6040 + }, + { + "epoch": 1.5940280798892625, + "grad_norm": 0.6407182216644287, + "learning_rate": 4.0612217970708537e-05, + "loss": 0.0261, + "step": 6045 + }, + { + "epoch": 1.5953463845494693, + "grad_norm": 0.0024960115551948547, + "learning_rate": 4.04802744425386e-05, + "loss": 0.0128, + "step": 6050 + }, + { + "epoch": 1.5966646892096763, + "grad_norm": 0.11380109190940857, + "learning_rate": 4.0348330914368655e-05, + "loss": 0.0199, + "step": 6055 + }, + { + "epoch": 1.5979829938698833, + "grad_norm": 0.18358005583286285, + "learning_rate": 4.0216387386198704e-05, + "loss": 0.0083, + "step": 6060 + }, + { + "epoch": 1.5993012985300903, + "grad_norm": 0.06412303447723389, + "learning_rate": 4.0084443858028766e-05, + "loss": 0.0548, + "step": 6065 + }, + { + "epoch": 1.6006196031902973, + "grad_norm": 0.6999421119689941, + "learning_rate": 3.995250032985882e-05, + "loss": 0.0074, + "step": 6070 + }, + { + "epoch": 1.6019379078505043, + "grad_norm": 0.18698133528232574, + "learning_rate": 3.982055680168888e-05, + "loss": 0.0542, + "step": 6075 + }, + { + "epoch": 1.6032562125107113, + "grad_norm": 0.014717207290232182, + "learning_rate": 3.968861327351894e-05, + "loss": 0.0071, + "step": 6080 + }, + { + "epoch": 1.604574517170918, + "grad_norm": 0.0765385851264, + "learning_rate": 3.955666974534899e-05, + "loss": 0.0063, + "step": 6085 + }, + { + "epoch": 1.6058928218311253, + "grad_norm": 0.4332450330257416, + "learning_rate": 3.9424726217179044e-05, + "loss": 0.0071, + "step": 6090 + }, + { + "epoch": 1.607211126491332, + "grad_norm": 0.003700035158544779, + "learning_rate": 3.929278268900911e-05, + "loss": 0.0052, + "step": 6095 + }, + { + "epoch": 1.6085294311515392, + "grad_norm": 0.02500278130173683, + "learning_rate": 3.916083916083916e-05, + "loss": 0.0387, + "step": 6100 + }, + { + "epoch": 1.609847735811746, + "grad_norm": 0.023568281903862953, + "learning_rate": 3.902889563266922e-05, + "loss": 0.0594, + "step": 6105 + }, + { + "epoch": 1.6111660404719532, + "grad_norm": 0.02687825821340084, + "learning_rate": 3.8896952104499274e-05, + "loss": 0.0229, + "step": 6110 + }, + { + "epoch": 1.61248434513216, + "grad_norm": 0.005178579594939947, + "learning_rate": 3.876500857632933e-05, + "loss": 0.0293, + "step": 6115 + }, + { + "epoch": 1.613802649792367, + "grad_norm": 0.3987988531589508, + "learning_rate": 3.863306504815939e-05, + "loss": 0.015, + "step": 6120 + }, + { + "epoch": 
1.615120954452574, + "grad_norm": 0.18915466964244843, + "learning_rate": 3.850112151998945e-05, + "loss": 0.023, + "step": 6125 + }, + { + "epoch": 1.616439259112781, + "grad_norm": 0.015252528712153435, + "learning_rate": 3.83691779918195e-05, + "loss": 0.0185, + "step": 6130 + }, + { + "epoch": 1.617757563772988, + "grad_norm": 0.04947187379002571, + "learning_rate": 3.823723446364956e-05, + "loss": 0.0131, + "step": 6135 + }, + { + "epoch": 1.619075868433195, + "grad_norm": 0.017095958814024925, + "learning_rate": 3.8105290935479615e-05, + "loss": 0.0071, + "step": 6140 + }, + { + "epoch": 1.620394173093402, + "grad_norm": 0.013050337322056293, + "learning_rate": 3.797334740730967e-05, + "loss": 0.0038, + "step": 6145 + }, + { + "epoch": 1.6217124777536087, + "grad_norm": 0.08132806420326233, + "learning_rate": 3.784140387913973e-05, + "loss": 0.0043, + "step": 6150 + }, + { + "epoch": 1.623030782413816, + "grad_norm": 0.020741304382681847, + "learning_rate": 3.770946035096979e-05, + "loss": 0.006, + "step": 6155 + }, + { + "epoch": 1.6243490870740227, + "grad_norm": 0.0576217919588089, + "learning_rate": 3.7577516822799844e-05, + "loss": 0.0033, + "step": 6160 + }, + { + "epoch": 1.62566739173423, + "grad_norm": 0.03032900020480156, + "learning_rate": 3.74455732946299e-05, + "loss": 0.0318, + "step": 6165 + }, + { + "epoch": 1.6269856963944367, + "grad_norm": 0.8868799209594727, + "learning_rate": 3.7313629766459955e-05, + "loss": 0.0304, + "step": 6170 + }, + { + "epoch": 1.6283040010546437, + "grad_norm": 0.003816834883764386, + "learning_rate": 3.718168623829002e-05, + "loss": 0.003, + "step": 6175 + }, + { + "epoch": 1.6296223057148507, + "grad_norm": 0.05368296429514885, + "learning_rate": 3.704974271012007e-05, + "loss": 0.0064, + "step": 6180 + }, + { + "epoch": 1.6309406103750577, + "grad_norm": 0.09963366389274597, + "learning_rate": 3.691779918195012e-05, + "loss": 0.0097, + "step": 6185 + }, + { + "epoch": 1.6322589150352647, + "grad_norm": 0.006273225415498018, + "learning_rate": 3.6785855653780185e-05, + "loss": 0.0071, + "step": 6190 + }, + { + "epoch": 1.6335772196954716, + "grad_norm": 0.15079188346862793, + "learning_rate": 3.665391212561024e-05, + "loss": 0.0058, + "step": 6195 + }, + { + "epoch": 1.6348955243556786, + "grad_norm": 0.004980973433703184, + "learning_rate": 3.6521968597440296e-05, + "loss": 0.0051, + "step": 6200 + }, + { + "epoch": 1.6362138290158854, + "grad_norm": 0.004235363099724054, + "learning_rate": 3.639002506927036e-05, + "loss": 0.0028, + "step": 6205 + }, + { + "epoch": 1.6375321336760926, + "grad_norm": 0.003829963505268097, + "learning_rate": 3.625808154110041e-05, + "loss": 0.0347, + "step": 6210 + }, + { + "epoch": 1.6388504383362994, + "grad_norm": 0.021650686860084534, + "learning_rate": 3.612613801293046e-05, + "loss": 0.0036, + "step": 6215 + }, + { + "epoch": 1.6401687429965066, + "grad_norm": 0.06326934695243835, + "learning_rate": 3.5994194484760525e-05, + "loss": 0.0228, + "step": 6220 + }, + { + "epoch": 1.6414870476567134, + "grad_norm": 0.017276322469115257, + "learning_rate": 3.586225095659058e-05, + "loss": 0.0025, + "step": 6225 + }, + { + "epoch": 1.6428053523169206, + "grad_norm": 0.005066063720732927, + "learning_rate": 3.573030742842064e-05, + "loss": 0.0047, + "step": 6230 + }, + { + "epoch": 1.6441236569771274, + "grad_norm": 0.003512267954647541, + "learning_rate": 3.559836390025069e-05, + "loss": 0.0018, + "step": 6235 + }, + { + "epoch": 1.6454419616373344, + "grad_norm": 0.004347699694335461, + "learning_rate": 
3.546642037208075e-05, + "loss": 0.0045, + "step": 6240 + }, + { + "epoch": 1.6467602662975414, + "grad_norm": 0.008277533575892448, + "learning_rate": 3.533447684391081e-05, + "loss": 0.0456, + "step": 6245 + }, + { + "epoch": 1.6480785709577483, + "grad_norm": 0.00973033718764782, + "learning_rate": 3.5202533315740866e-05, + "loss": 0.0215, + "step": 6250 + }, + { + "epoch": 1.6493968756179553, + "grad_norm": 1.9432978630065918, + "learning_rate": 3.507058978757092e-05, + "loss": 0.0132, + "step": 6255 + }, + { + "epoch": 1.6507151802781623, + "grad_norm": 0.2693535387516022, + "learning_rate": 3.493864625940098e-05, + "loss": 0.0037, + "step": 6260 + }, + { + "epoch": 1.6520334849383693, + "grad_norm": 0.02107766456902027, + "learning_rate": 3.480670273123103e-05, + "loss": 0.0031, + "step": 6265 + }, + { + "epoch": 1.653351789598576, + "grad_norm": 0.07168436795473099, + "learning_rate": 3.467475920306109e-05, + "loss": 0.0101, + "step": 6270 + }, + { + "epoch": 1.6546700942587833, + "grad_norm": 0.06479799002408981, + "learning_rate": 3.454281567489115e-05, + "loss": 0.0032, + "step": 6275 + }, + { + "epoch": 1.65598839891899, + "grad_norm": 0.0013557536294683814, + "learning_rate": 3.441087214672121e-05, + "loss": 0.0037, + "step": 6280 + }, + { + "epoch": 1.6573067035791973, + "grad_norm": 0.07330150157213211, + "learning_rate": 3.427892861855126e-05, + "loss": 0.0031, + "step": 6285 + }, + { + "epoch": 1.658625008239404, + "grad_norm": 0.08246012777090073, + "learning_rate": 3.414698509038132e-05, + "loss": 0.0028, + "step": 6290 + }, + { + "epoch": 1.659943312899611, + "grad_norm": 0.6232367157936096, + "learning_rate": 3.4015041562211374e-05, + "loss": 0.0042, + "step": 6295 + }, + { + "epoch": 1.661261617559818, + "grad_norm": 0.007676729932427406, + "learning_rate": 3.388309803404143e-05, + "loss": 0.0501, + "step": 6300 + }, + { + "epoch": 1.662579922220025, + "grad_norm": 0.02081216312944889, + "learning_rate": 3.375115450587149e-05, + "loss": 0.0047, + "step": 6305 + }, + { + "epoch": 1.663898226880232, + "grad_norm": 0.008829087018966675, + "learning_rate": 3.361921097770154e-05, + "loss": 0.0298, + "step": 6310 + }, + { + "epoch": 1.665216531540439, + "grad_norm": 0.4426127076148987, + "learning_rate": 3.34872674495316e-05, + "loss": 0.0045, + "step": 6315 + }, + { + "epoch": 1.666534836200646, + "grad_norm": 0.025818035006523132, + "learning_rate": 3.335532392136166e-05, + "loss": 0.0028, + "step": 6320 + }, + { + "epoch": 1.6678531408608528, + "grad_norm": 0.6068133115768433, + "learning_rate": 3.3223380393191715e-05, + "loss": 0.0202, + "step": 6325 + }, + { + "epoch": 1.66917144552106, + "grad_norm": 0.02740122564136982, + "learning_rate": 3.309143686502178e-05, + "loss": 0.0025, + "step": 6330 + }, + { + "epoch": 1.6704897501812668, + "grad_norm": 0.15878735482692719, + "learning_rate": 3.2959493336851826e-05, + "loss": 0.004, + "step": 6335 + }, + { + "epoch": 1.671808054841474, + "grad_norm": 0.006827466655522585, + "learning_rate": 3.282754980868188e-05, + "loss": 0.0048, + "step": 6340 + }, + { + "epoch": 1.6731263595016808, + "grad_norm": 0.19508551061153412, + "learning_rate": 3.2695606280511944e-05, + "loss": 0.0025, + "step": 6345 + }, + { + "epoch": 1.674444664161888, + "grad_norm": 0.8176754713058472, + "learning_rate": 3.2563662752342e-05, + "loss": 0.0151, + "step": 6350 + }, + { + "epoch": 1.6757629688220947, + "grad_norm": 0.011672024615108967, + "learning_rate": 3.2431719224172055e-05, + "loss": 0.0452, + "step": 6355 + }, + { + "epoch": 
1.6770812734823017, + "grad_norm": 0.015824951231479645, + "learning_rate": 3.229977569600211e-05, + "loss": 0.0236, + "step": 6360 + }, + { + "epoch": 1.6783995781425087, + "grad_norm": 0.1358737051486969, + "learning_rate": 3.216783216783217e-05, + "loss": 0.0078, + "step": 6365 + }, + { + "epoch": 1.6797178828027157, + "grad_norm": 0.004896901547908783, + "learning_rate": 3.203588863966223e-05, + "loss": 0.0042, + "step": 6370 + }, + { + "epoch": 1.6810361874629227, + "grad_norm": 0.22593103349208832, + "learning_rate": 3.1903945111492285e-05, + "loss": 0.0053, + "step": 6375 + }, + { + "epoch": 1.6823544921231297, + "grad_norm": 0.0073196059092879295, + "learning_rate": 3.177200158332234e-05, + "loss": 0.0287, + "step": 6380 + }, + { + "epoch": 1.6836727967833367, + "grad_norm": 0.018524926155805588, + "learning_rate": 3.1640058055152396e-05, + "loss": 0.0122, + "step": 6385 + }, + { + "epoch": 1.6849911014435435, + "grad_norm": 0.7453815937042236, + "learning_rate": 3.150811452698245e-05, + "loss": 0.0378, + "step": 6390 + }, + { + "epoch": 1.6863094061037507, + "grad_norm": 0.22409795224666595, + "learning_rate": 3.137617099881251e-05, + "loss": 0.0282, + "step": 6395 + }, + { + "epoch": 1.6876277107639575, + "grad_norm": 0.005432693753391504, + "learning_rate": 3.124422747064257e-05, + "loss": 0.0162, + "step": 6400 + }, + { + "epoch": 1.6889460154241647, + "grad_norm": 0.1493055820465088, + "learning_rate": 3.1112283942472626e-05, + "loss": 0.0123, + "step": 6405 + }, + { + "epoch": 1.6902643200843714, + "grad_norm": 0.1638440042734146, + "learning_rate": 3.0980340414302674e-05, + "loss": 0.0058, + "step": 6410 + }, + { + "epoch": 1.6915826247445784, + "grad_norm": 0.015779908746480942, + "learning_rate": 3.084839688613274e-05, + "loss": 0.0157, + "step": 6415 + }, + { + "epoch": 1.6929009294047854, + "grad_norm": 0.0012348912423476577, + "learning_rate": 3.071645335796279e-05, + "loss": 0.0016, + "step": 6420 + }, + { + "epoch": 1.6942192340649924, + "grad_norm": 0.05294624716043472, + "learning_rate": 3.058450982979285e-05, + "loss": 0.0037, + "step": 6425 + }, + { + "epoch": 1.6955375387251994, + "grad_norm": 0.01926981844007969, + "learning_rate": 3.045256630162291e-05, + "loss": 0.0053, + "step": 6430 + }, + { + "epoch": 1.6968558433854064, + "grad_norm": 0.005958891473710537, + "learning_rate": 3.0320622773452963e-05, + "loss": 0.0025, + "step": 6435 + }, + { + "epoch": 1.6981741480456134, + "grad_norm": 0.001902201445773244, + "learning_rate": 3.018867924528302e-05, + "loss": 0.0027, + "step": 6440 + }, + { + "epoch": 1.6994924527058202, + "grad_norm": 0.036614127457141876, + "learning_rate": 3.0056735717113078e-05, + "loss": 0.0026, + "step": 6445 + }, + { + "epoch": 1.7008107573660274, + "grad_norm": 0.07294526696205139, + "learning_rate": 2.9924792188943133e-05, + "loss": 0.0042, + "step": 6450 + }, + { + "epoch": 1.7021290620262342, + "grad_norm": 0.42822372913360596, + "learning_rate": 2.9792848660773192e-05, + "loss": 0.013, + "step": 6455 + }, + { + "epoch": 1.7034473666864414, + "grad_norm": 0.036622967571020126, + "learning_rate": 2.9660905132603245e-05, + "loss": 0.0029, + "step": 6460 + }, + { + "epoch": 1.7047656713466481, + "grad_norm": 0.08314034342765808, + "learning_rate": 2.9528961604433304e-05, + "loss": 0.0043, + "step": 6465 + }, + { + "epoch": 1.7060839760068551, + "grad_norm": 0.0005654952838085592, + "learning_rate": 2.939701807626336e-05, + "loss": 0.0595, + "step": 6470 + }, + { + "epoch": 1.7074022806670621, + "grad_norm": 0.004545385017991066, + 
"learning_rate": 2.926507454809342e-05, + "loss": 0.0044, + "step": 6475 + }, + { + "epoch": 1.7087205853272691, + "grad_norm": 0.00033831383916549385, + "learning_rate": 2.9133131019923477e-05, + "loss": 0.0046, + "step": 6480 + }, + { + "epoch": 1.710038889987476, + "grad_norm": 0.0019903562497347593, + "learning_rate": 2.900118749175353e-05, + "loss": 0.0026, + "step": 6485 + }, + { + "epoch": 1.711357194647683, + "grad_norm": 0.10188104957342148, + "learning_rate": 2.8869243963583585e-05, + "loss": 0.0069, + "step": 6490 + }, + { + "epoch": 1.71267549930789, + "grad_norm": 0.2123432606458664, + "learning_rate": 2.8737300435413644e-05, + "loss": 0.0199, + "step": 6495 + }, + { + "epoch": 1.7139938039680969, + "grad_norm": 0.43209517002105713, + "learning_rate": 2.8605356907243703e-05, + "loss": 0.0099, + "step": 6500 + }, + { + "epoch": 1.7139938039680969, + "eval_loss": 0.024327505379915237, + "eval_runtime": 452.0052, + "eval_samples_per_second": 7.46, + "eval_steps_per_second": 3.73, + "step": 6500 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.246198741313147e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6500/training_args.bin b/checkpoint-6500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-6500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-7000/README.md b/checkpoint-7000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-7000/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-7000/adapter_config.json b/checkpoint-7000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-7000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-7000/adapter_model.safetensors b/checkpoint-7000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0bce0eea49c959b952de64929368dc48654fe22a --- /dev/null +++ b/checkpoint-7000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b48019b11bbca99007836082a576544ac6119d26881761ff49993554f0373185 +size 97307544 diff --git a/checkpoint-7000/optimizer.pt b/checkpoint-7000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..33433f17e5becce14706250f0b1cf69632d1be35 --- 
/dev/null +++ b/checkpoint-7000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:027751c323fdc40bd700a701e569a7c75e2a9a74d317ea4d3766c92a794d490c +size 50866370 diff --git a/checkpoint-7000/rng_state.pth b/checkpoint-7000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ef3861a89658e598642b74a3479391e2cedb91a --- /dev/null +++ b/checkpoint-7000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:470b36955468369ff24a31f177aac32d2d3f3dbbf0bae909b882d4b5a86b9cbe +size 14244 diff --git a/checkpoint-7000/scheduler.pt b/checkpoint-7000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb8cc5fdc2edfffe1f97fd498e5bae8b36b9a0dd --- /dev/null +++ b/checkpoint-7000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be90db71b32c4ce3bcb6788246301e28306ec75a1716a18be02fdd9812f59de2 +size 1064 diff --git a/checkpoint-7000/special_tokens_map.json b/checkpoint-7000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-7000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-7000/tokenizer.json b/checkpoint-7000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-7000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-7000/tokenizer_config.json b/checkpoint-7000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-7000/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": 
"<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": 
"<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": 
"<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": 
"<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": 
"<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": 
"<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": 
"<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": 
"<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-7000/trainer_state.json b/checkpoint-7000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5cf915611a4a7bd311ce3f0eb10f52c335562d7d --- /dev/null +++ b/checkpoint-7000/trainer_state.json @@ -0,0 +1,9948 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8458242699887943, + "eval_steps": 500, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + 
"step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 
0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + "learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + 
"learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 
0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, + "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + 
"learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + "learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 
0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + "grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 
0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + "step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 
0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + 
"loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + "learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 
0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + "epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 
0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 
0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + "epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 
0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 
0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + "loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + 
}, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 
0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + "learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 
0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + 
"loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + 
"grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + "learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 
0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + "step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, 
+ { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.03460961952805519, + "learning_rate": 0.00013402823591502838, + "loss": 0.0097, + "step": 2505 + }, + { + "epoch": 0.6617889394239008, + "grad_norm": 0.31785696744918823, + "learning_rate": 0.00013389629238685842, + "loss": 0.0303, + "step": 2510 + }, + { + "epoch": 0.6631072440841078, + "grad_norm": 0.4273851215839386, + "learning_rate": 0.0001337643488586885, + "loss": 0.0499, + "step": 2515 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.02236153744161129, + "learning_rate": 0.00013363240533051856, + "loss": 0.0069, + "step": 2520 + }, + { + "epoch": 0.6657438534045218, + "grad_norm": 0.1592864990234375, + "learning_rate": 0.0001335004618023486, + "loss": 0.0326, + "step": 2525 + }, + { + "epoch": 0.6670621580647288, + 
"grad_norm": 0.029961545020341873, + "learning_rate": 0.00013336851827417867, + "loss": 0.0178, + "step": 2530 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.03120764158666134, + "learning_rate": 0.00013323657474600872, + "loss": 0.115, + "step": 2535 + }, + { + "epoch": 0.6696987673851427, + "grad_norm": 0.01060028001666069, + "learning_rate": 0.00013310463121783879, + "loss": 0.0036, + "step": 2540 + }, + { + "epoch": 0.6710170720453497, + "grad_norm": 0.053470809012651443, + "learning_rate": 0.00013297268768966883, + "loss": 0.0079, + "step": 2545 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.022777097299695015, + "learning_rate": 0.00013284074416149887, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6736536813657636, + "grad_norm": 0.0548521913588047, + "learning_rate": 0.00013270880063332894, + "loss": 0.0503, + "step": 2555 + }, + { + "epoch": 0.6749719860259706, + "grad_norm": 0.02028457075357437, + "learning_rate": 0.00013257685710515898, + "loss": 0.0096, + "step": 2560 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.01569107361137867, + "learning_rate": 0.00013244491357698905, + "loss": 0.008, + "step": 2565 + }, + { + "epoch": 0.6776085953463845, + "grad_norm": 0.00743742985650897, + "learning_rate": 0.00013231297004881912, + "loss": 0.005, + "step": 2570 + }, + { + "epoch": 0.6789269000065915, + "grad_norm": 0.025164416059851646, + "learning_rate": 0.00013218102652064916, + "loss": 0.018, + "step": 2575 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.3653188645839691, + "learning_rate": 0.00013204908299247923, + "loss": 0.0295, + "step": 2580 + }, + { + "epoch": 0.6815635093270055, + "grad_norm": 0.685422956943512, + "learning_rate": 0.00013191713946430927, + "loss": 0.0335, + "step": 2585 + }, + { + "epoch": 0.6828818139872125, + "grad_norm": 0.675740122795105, + "learning_rate": 0.00013178519593613934, + "loss": 0.0592, + "step": 2590 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.10513252764940262, + "learning_rate": 0.00013165325240796938, + "loss": 0.0353, + "step": 2595 + }, + { + "epoch": 0.6855184233076264, + "grad_norm": 0.43512973189353943, + "learning_rate": 0.00013152130887979945, + "loss": 0.0142, + "step": 2600 + }, + { + "epoch": 0.6868367279678333, + "grad_norm": 0.029436839744448662, + "learning_rate": 0.00013138936535162952, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.5607122778892517, + "learning_rate": 0.00013125742182345957, + "loss": 0.0184, + "step": 2610 + }, + { + "epoch": 0.6894733372882473, + "grad_norm": 0.11365406215190887, + "learning_rate": 0.00013112547829528963, + "loss": 0.006, + "step": 2615 + }, + { + "epoch": 0.6907916419484543, + "grad_norm": 0.047227244824171066, + "learning_rate": 0.00013099353476711968, + "loss": 0.008, + "step": 2620 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.0005877618095837533, + "learning_rate": 0.00013086159123894975, + "loss": 0.0286, + "step": 2625 + }, + { + "epoch": 0.6934282512688682, + "grad_norm": 0.010759112425148487, + "learning_rate": 0.0001307296477107798, + "loss": 0.0062, + "step": 2630 + }, + { + "epoch": 0.6947465559290752, + "grad_norm": 0.07117745280265808, + "learning_rate": 0.00013059770418260986, + "loss": 0.0891, + "step": 2635 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.0639057606458664, + "learning_rate": 0.00013046576065443993, + "loss": 0.0072, + "step": 2640 + }, + { + "epoch": 0.6973831652494892, + "grad_norm": 0.027350090444087982, + "learning_rate": 
0.00013033381712626994, + "loss": 0.0103, + "step": 2645 + }, + { + "epoch": 0.6987014699096962, + "grad_norm": 0.015336195938289165, + "learning_rate": 0.0001302018735981, + "loss": 0.0041, + "step": 2650 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 1.0650830268859863, + "learning_rate": 0.00013006993006993008, + "loss": 0.0443, + "step": 2655 + }, + { + "epoch": 0.70133807923011, + "grad_norm": 0.019073212519288063, + "learning_rate": 0.00012993798654176012, + "loss": 0.0331, + "step": 2660 + }, + { + "epoch": 0.702656383890317, + "grad_norm": 0.10109209269285202, + "learning_rate": 0.0001298060430135902, + "loss": 0.0054, + "step": 2665 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.03528957813978195, + "learning_rate": 0.00012967409948542023, + "loss": 0.0427, + "step": 2670 + }, + { + "epoch": 0.705292993210731, + "grad_norm": 0.03577788919210434, + "learning_rate": 0.0001295421559572503, + "loss": 0.023, + "step": 2675 + }, + { + "epoch": 0.706611297870938, + "grad_norm": 0.5576180815696716, + "learning_rate": 0.00012941021242908034, + "loss": 0.0416, + "step": 2680 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.017131298780441284, + "learning_rate": 0.0001292782689009104, + "loss": 0.0235, + "step": 2685 + }, + { + "epoch": 0.7092479071913519, + "grad_norm": 0.8517888784408569, + "learning_rate": 0.00012914632537274048, + "loss": 0.0168, + "step": 2690 + }, + { + "epoch": 0.7105662118515589, + "grad_norm": 0.23812156915664673, + "learning_rate": 0.00012901438184457052, + "loss": 0.0483, + "step": 2695 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.11746613681316376, + "learning_rate": 0.0001288824383164006, + "loss": 0.0255, + "step": 2700 + }, + { + "epoch": 0.7132028211719729, + "grad_norm": 0.20089928805828094, + "learning_rate": 0.00012875049478823064, + "loss": 0.0267, + "step": 2705 + }, + { + "epoch": 0.7145211258321799, + "grad_norm": 0.8301129937171936, + "learning_rate": 0.0001286185512600607, + "loss": 0.016, + "step": 2710 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.01838674768805504, + "learning_rate": 0.00012848660773189077, + "loss": 0.0229, + "step": 2715 + }, + { + "epoch": 0.7171577351525937, + "grad_norm": 0.03670337051153183, + "learning_rate": 0.00012835466420372082, + "loss": 0.038, + "step": 2720 + }, + { + "epoch": 0.7184760398128007, + "grad_norm": 0.0452633760869503, + "learning_rate": 0.00012822272067555089, + "loss": 0.0622, + "step": 2725 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.09503110498189926, + "learning_rate": 0.00012809077714738093, + "loss": 0.0209, + "step": 2730 + }, + { + "epoch": 0.7211126491332147, + "grad_norm": 1.0327308177947998, + "learning_rate": 0.000127958833619211, + "loss": 0.0361, + "step": 2735 + }, + { + "epoch": 0.7224309537934217, + "grad_norm": 1.0049290657043457, + "learning_rate": 0.00012782689009104104, + "loss": 0.0365, + "step": 2740 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.029774073511362076, + "learning_rate": 0.00012769494656287108, + "loss": 0.0257, + "step": 2745 + }, + { + "epoch": 0.7250675631138356, + "grad_norm": 0.20974040031433105, + "learning_rate": 0.00012756300303470115, + "loss": 0.0542, + "step": 2750 + }, + { + "epoch": 0.7263858677740426, + "grad_norm": 0.8153854608535767, + "learning_rate": 0.0001274310595065312, + "loss": 0.0216, + "step": 2755 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.4393698573112488, + "learning_rate": 0.00012729911597836126, + "loss": 0.0451, + "step": 2760 + }, + { + "epoch": 
0.7290224770944566, + "grad_norm": 0.06990349292755127, + "learning_rate": 0.00012716717245019133, + "loss": 0.03, + "step": 2765 + }, + { + "epoch": 0.7303407817546635, + "grad_norm": 0.32689470052719116, + "learning_rate": 0.00012703522892202137, + "loss": 0.0263, + "step": 2770 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.026600876823067665, + "learning_rate": 0.00012690328539385144, + "loss": 0.0404, + "step": 2775 + }, + { + "epoch": 0.7329773910750774, + "grad_norm": 0.11228257417678833, + "learning_rate": 0.00012677134186568148, + "loss": 0.0224, + "step": 2780 + }, + { + "epoch": 0.7342956957352844, + "grad_norm": 0.6469443440437317, + "learning_rate": 0.00012663939833751155, + "loss": 0.0178, + "step": 2785 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.020773250609636307, + "learning_rate": 0.0001265074548093416, + "loss": 0.011, + "step": 2790 + }, + { + "epoch": 0.7369323050556984, + "grad_norm": 0.7378728985786438, + "learning_rate": 0.00012637551128117167, + "loss": 0.0227, + "step": 2795 + }, + { + "epoch": 0.7382506097159054, + "grad_norm": 0.008189595304429531, + "learning_rate": 0.00012624356775300173, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.031633853912353516, + "learning_rate": 0.00012611162422483178, + "loss": 0.0093, + "step": 2805 + }, + { + "epoch": 0.7408872190363193, + "grad_norm": 0.5078475475311279, + "learning_rate": 0.00012597968069666185, + "loss": 0.0567, + "step": 2810 + }, + { + "epoch": 0.7422055236965263, + "grad_norm": 0.21766887605190277, + "learning_rate": 0.0001258477371684919, + "loss": 0.0485, + "step": 2815 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.3029612898826599, + "learning_rate": 0.00012571579364032196, + "loss": 0.032, + "step": 2820 + }, + { + "epoch": 0.7448421330169402, + "grad_norm": 1.2135159969329834, + "learning_rate": 0.00012558385011215203, + "loss": 0.0139, + "step": 2825 + }, + { + "epoch": 0.7461604376771472, + "grad_norm": 0.016875172033905983, + "learning_rate": 0.00012545190658398207, + "loss": 0.0323, + "step": 2830 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.08923230320215225, + "learning_rate": 0.00012531996305581214, + "loss": 0.0343, + "step": 2835 + }, + { + "epoch": 0.7487970469975611, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.00012518801952764215, + "loss": 0.0431, + "step": 2840 + }, + { + "epoch": 0.7501153516577681, + "grad_norm": 0.7344386577606201, + "learning_rate": 0.00012505607599947222, + "loss": 0.0389, + "step": 2845 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.03681635856628418, + "learning_rate": 0.0001249241324713023, + "loss": 0.0258, + "step": 2850 + }, + { + "epoch": 0.7527519609781821, + "grad_norm": 0.22866861522197723, + "learning_rate": 0.00012479218894313233, + "loss": 0.0223, + "step": 2855 + }, + { + "epoch": 0.7540702656383891, + "grad_norm": 0.029770435765385628, + "learning_rate": 0.0001246602454149624, + "loss": 0.0205, + "step": 2860 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.011845707893371582, + "learning_rate": 0.00012452830188679244, + "loss": 0.0252, + "step": 2865 + }, + { + "epoch": 0.756706874958803, + "grad_norm": 0.06696149706840515, + "learning_rate": 0.00012439635835862251, + "loss": 0.0166, + "step": 2870 + }, + { + "epoch": 0.75802517961901, + "grad_norm": 0.01653144136071205, + "learning_rate": 0.00012426441483045256, + "loss": 0.0487, + "step": 2875 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.031312476843595505, + 
"learning_rate": 0.00012413247130228263, + "loss": 0.0155, + "step": 2880 + }, + { + "epoch": 0.7606617889394239, + "grad_norm": 0.011625733226537704, + "learning_rate": 0.0001240005277741127, + "loss": 0.0333, + "step": 2885 + }, + { + "epoch": 0.7619800935996309, + "grad_norm": 0.012089414522051811, + "learning_rate": 0.00012386858424594274, + "loss": 0.003, + "step": 2890 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.3012307584285736, + "learning_rate": 0.0001237366407177728, + "loss": 0.0172, + "step": 2895 + }, + { + "epoch": 0.7646167029200448, + "grad_norm": 0.31575000286102295, + "learning_rate": 0.00012360469718960285, + "loss": 0.0409, + "step": 2900 + }, + { + "epoch": 0.7659350075802518, + "grad_norm": 0.009794364683330059, + "learning_rate": 0.00012347275366143292, + "loss": 0.0214, + "step": 2905 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5973085165023804, + "learning_rate": 0.00012334081013326299, + "loss": 0.0245, + "step": 2910 + }, + { + "epoch": 0.7685716169006658, + "grad_norm": 0.019750040024518967, + "learning_rate": 0.00012320886660509303, + "loss": 0.0063, + "step": 2915 + }, + { + "epoch": 0.7698899215608728, + "grad_norm": 0.06402858346700668, + "learning_rate": 0.0001230769230769231, + "loss": 0.0444, + "step": 2920 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.02876671403646469, + "learning_rate": 0.00012294497954875314, + "loss": 0.0103, + "step": 2925 + }, + { + "epoch": 0.7725265308812866, + "grad_norm": 0.6962207555770874, + "learning_rate": 0.0001228130360205832, + "loss": 0.0318, + "step": 2930 + }, + { + "epoch": 0.7738448355414936, + "grad_norm": 0.006536522414535284, + "learning_rate": 0.00012268109249241325, + "loss": 0.0096, + "step": 2935 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.07097168266773224, + "learning_rate": 0.0001225491489642433, + "loss": 0.0174, + "step": 2940 + }, + { + "epoch": 0.7764814448619076, + "grad_norm": 0.042360126972198486, + "learning_rate": 0.00012241720543607336, + "loss": 0.0158, + "step": 2945 + }, + { + "epoch": 0.7777997495221146, + "grad_norm": 0.01159572321921587, + "learning_rate": 0.0001222852619079034, + "loss": 0.0265, + "step": 2950 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.38408163189888, + "learning_rate": 0.00012215331837973347, + "loss": 0.0233, + "step": 2955 + }, + { + "epoch": 0.7804363588425285, + "grad_norm": 0.15588605403900146, + "learning_rate": 0.00012202137485156353, + "loss": 0.0041, + "step": 2960 + }, + { + "epoch": 0.7817546635027355, + "grad_norm": 0.006892362609505653, + "learning_rate": 0.00012188943132339358, + "loss": 0.0026, + "step": 2965 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.030915727838873863, + "learning_rate": 0.00012175748779522364, + "loss": 0.0028, + "step": 2970 + }, + { + "epoch": 0.7843912728231495, + "grad_norm": 0.8151025772094727, + "learning_rate": 0.00012162554426705371, + "loss": 0.0429, + "step": 2975 + }, + { + "epoch": 0.7857095774833565, + "grad_norm": 0.6765475273132324, + "learning_rate": 0.00012149360073888377, + "loss": 0.0319, + "step": 2980 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.054469238966703415, + "learning_rate": 0.00012136165721071382, + "loss": 0.0413, + "step": 2985 + }, + { + "epoch": 0.7883461868037703, + "grad_norm": 0.045610666275024414, + "learning_rate": 0.00012122971368254388, + "loss": 0.0521, + "step": 2990 + }, + { + "epoch": 0.7896644914639773, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.00012109777015437393, + "loss": 0.0846, + 
"step": 2995 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.0272397268563509, + "learning_rate": 0.00012096582662620399, + "loss": 0.0364, + "step": 3000 + }, + { + "epoch": 0.7909827961241843, + "eval_loss": 0.033312585204839706, + "eval_runtime": 452.2552, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 3000 + }, + { + "epoch": 0.7923011007843913, + "grad_norm": 0.08674059063196182, + "learning_rate": 0.00012083388309803406, + "loss": 0.0081, + "step": 3005 + }, + { + "epoch": 0.7936194054445982, + "grad_norm": 0.21960832178592682, + "learning_rate": 0.00012070193956986411, + "loss": 0.0468, + "step": 3010 + }, + { + "epoch": 0.7949377101048052, + "grad_norm": 0.11259289085865021, + "learning_rate": 0.00012056999604169417, + "loss": 0.0124, + "step": 3015 + }, + { + "epoch": 0.7962560147650122, + "grad_norm": 0.02945362776517868, + "learning_rate": 0.00012043805251352422, + "loss": 0.0298, + "step": 3020 + }, + { + "epoch": 0.7975743194252192, + "grad_norm": 0.27889615297317505, + "learning_rate": 0.00012030610898535428, + "loss": 0.0251, + "step": 3025 + }, + { + "epoch": 0.7988926240854262, + "grad_norm": 0.05873241275548935, + "learning_rate": 0.00012017416545718434, + "loss": 0.0132, + "step": 3030 + }, + { + "epoch": 0.8002109287456332, + "grad_norm": 0.1570046991109848, + "learning_rate": 0.00012004222192901439, + "loss": 0.0228, + "step": 3035 + }, + { + "epoch": 0.80152923340584, + "grad_norm": 0.12575332820415497, + "learning_rate": 0.00011991027840084443, + "loss": 0.0049, + "step": 3040 + }, + { + "epoch": 0.802847538066047, + "grad_norm": 0.8416435122489929, + "learning_rate": 0.00011977833487267449, + "loss": 0.0542, + "step": 3045 + }, + { + "epoch": 0.804165842726254, + "grad_norm": 0.2605098485946655, + "learning_rate": 0.00011964639134450454, + "loss": 0.0084, + "step": 3050 + }, + { + "epoch": 0.805484147386461, + "grad_norm": 0.8996294736862183, + "learning_rate": 0.00011951444781633461, + "loss": 0.0442, + "step": 3055 + }, + { + "epoch": 0.806802452046668, + "grad_norm": 2.7525105476379395, + "learning_rate": 0.00011938250428816467, + "loss": 0.0642, + "step": 3060 + }, + { + "epoch": 0.808120756706875, + "grad_norm": 0.14955930411815643, + "learning_rate": 0.00011925056075999473, + "loss": 0.0384, + "step": 3065 + }, + { + "epoch": 0.8094390613670819, + "grad_norm": 0.018756115809082985, + "learning_rate": 0.00011911861723182478, + "loss": 0.0154, + "step": 3070 + }, + { + "epoch": 0.8107573660272889, + "grad_norm": 0.23998615145683289, + "learning_rate": 0.00011898667370365484, + "loss": 0.0413, + "step": 3075 + }, + { + "epoch": 0.8120756706874959, + "grad_norm": 0.27253249287605286, + "learning_rate": 0.00011885473017548489, + "loss": 0.0081, + "step": 3080 + }, + { + "epoch": 0.8133939753477029, + "grad_norm": 0.2925993502140045, + "learning_rate": 0.00011872278664731495, + "loss": 0.0332, + "step": 3085 + }, + { + "epoch": 0.8147122800079099, + "grad_norm": 0.5364832878112793, + "learning_rate": 0.00011859084311914502, + "loss": 0.0143, + "step": 3090 + }, + { + "epoch": 0.8160305846681168, + "grad_norm": 0.32104921340942383, + "learning_rate": 0.00011845889959097507, + "loss": 0.0216, + "step": 3095 + }, + { + "epoch": 0.8173488893283237, + "grad_norm": 0.0205856766551733, + "learning_rate": 0.00011832695606280513, + "loss": 0.0346, + "step": 3100 + }, + { + "epoch": 0.8186671939885307, + "grad_norm": 0.2541547417640686, + "learning_rate": 0.00011819501253463518, + "loss": 0.0793, + "step": 3105 + }, + { + "epoch": 
0.8199854986487377, + "grad_norm": 0.08333491533994675, + "learning_rate": 0.00011806306900646524, + "loss": 0.0049, + "step": 3110 + }, + { + "epoch": 0.8213038033089447, + "grad_norm": 0.0355968177318573, + "learning_rate": 0.0001179311254782953, + "loss": 0.0051, + "step": 3115 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.06948401033878326, + "learning_rate": 0.00011779918195012536, + "loss": 0.013, + "step": 3120 + }, + { + "epoch": 0.8239404126293587, + "grad_norm": 0.03328891843557358, + "learning_rate": 0.00011766723842195542, + "loss": 0.0122, + "step": 3125 + }, + { + "epoch": 0.8252587172895656, + "grad_norm": 0.013782350346446037, + "learning_rate": 0.00011753529489378548, + "loss": 0.0073, + "step": 3130 + }, + { + "epoch": 0.8265770219497726, + "grad_norm": 0.024390392005443573, + "learning_rate": 0.00011740335136561553, + "loss": 0.0143, + "step": 3135 + }, + { + "epoch": 0.8278953266099796, + "grad_norm": 0.002548128366470337, + "learning_rate": 0.00011727140783744557, + "loss": 0.0027, + "step": 3140 + }, + { + "epoch": 0.8292136312701865, + "grad_norm": 0.11674848943948746, + "learning_rate": 0.00011713946430927563, + "loss": 0.0253, + "step": 3145 + }, + { + "epoch": 0.8305319359303935, + "grad_norm": 0.005774884019047022, + "learning_rate": 0.00011700752078110568, + "loss": 0.0018, + "step": 3150 + }, + { + "epoch": 0.8318502405906005, + "grad_norm": 0.5763069987297058, + "learning_rate": 0.00011687557725293574, + "loss": 0.0119, + "step": 3155 + }, + { + "epoch": 0.8331685452508074, + "grad_norm": 0.0027607593219727278, + "learning_rate": 0.0001167436337247658, + "loss": 0.0279, + "step": 3160 + }, + { + "epoch": 0.8344868499110144, + "grad_norm": 1.859642505645752, + "learning_rate": 0.00011661169019659585, + "loss": 0.0228, + "step": 3165 + }, + { + "epoch": 0.8358051545712214, + "grad_norm": 0.16597022116184235, + "learning_rate": 0.00011647974666842592, + "loss": 0.1228, + "step": 3170 + }, + { + "epoch": 0.8371234592314284, + "grad_norm": 0.33833742141723633, + "learning_rate": 0.00011634780314025598, + "loss": 0.073, + "step": 3175 + }, + { + "epoch": 0.8384417638916354, + "grad_norm": 0.024682912975549698, + "learning_rate": 0.00011621585961208603, + "loss": 0.0042, + "step": 3180 + }, + { + "epoch": 0.8397600685518424, + "grad_norm": 0.05926942452788353, + "learning_rate": 0.00011608391608391609, + "loss": 0.0066, + "step": 3185 + }, + { + "epoch": 0.8410783732120493, + "grad_norm": 0.1414029747247696, + "learning_rate": 0.00011595197255574614, + "loss": 0.0603, + "step": 3190 + }, + { + "epoch": 0.8423966778722562, + "grad_norm": 0.37928736209869385, + "learning_rate": 0.0001158200290275762, + "loss": 0.0266, + "step": 3195 + }, + { + "epoch": 0.8437149825324632, + "grad_norm": 0.018329354003071785, + "learning_rate": 0.00011568808549940627, + "loss": 0.0047, + "step": 3200 + }, + { + "epoch": 0.8450332871926702, + "grad_norm": 0.2993735373020172, + "learning_rate": 0.00011555614197123632, + "loss": 0.0218, + "step": 3205 + }, + { + "epoch": 0.8463515918528772, + "grad_norm": 0.1767728328704834, + "learning_rate": 0.00011542419844306638, + "loss": 0.0363, + "step": 3210 + }, + { + "epoch": 0.8476698965130842, + "grad_norm": 0.39774414896965027, + "learning_rate": 0.00011529225491489644, + "loss": 0.0506, + "step": 3215 + }, + { + "epoch": 0.8489882011732911, + "grad_norm": 0.021896762773394585, + "learning_rate": 0.00011516031138672649, + "loss": 0.0081, + "step": 3220 + }, + { + "epoch": 0.8503065058334981, + "grad_norm": 0.358372300863266, + 
"learning_rate": 0.00011502836785855655, + "loss": 0.0224, + "step": 3225 + }, + { + "epoch": 0.8516248104937051, + "grad_norm": 0.01605542004108429, + "learning_rate": 0.00011489642433038662, + "loss": 0.0215, + "step": 3230 + }, + { + "epoch": 0.8529431151539121, + "grad_norm": 0.021189266815781593, + "learning_rate": 0.00011476448080221667, + "loss": 0.0051, + "step": 3235 + }, + { + "epoch": 0.8542614198141191, + "grad_norm": 0.013394076377153397, + "learning_rate": 0.0001146325372740467, + "loss": 0.021, + "step": 3240 + }, + { + "epoch": 0.8555797244743261, + "grad_norm": 0.19848507642745972, + "learning_rate": 0.00011450059374587676, + "loss": 0.0285, + "step": 3245 + }, + { + "epoch": 0.856898029134533, + "grad_norm": 0.2463046759366989, + "learning_rate": 0.00011436865021770683, + "loss": 0.0384, + "step": 3250 + }, + { + "epoch": 0.8582163337947399, + "grad_norm": 0.37432390451431274, + "learning_rate": 0.00011423670668953688, + "loss": 0.0098, + "step": 3255 + }, + { + "epoch": 0.8595346384549469, + "grad_norm": 0.060943394899368286, + "learning_rate": 0.00011410476316136694, + "loss": 0.0087, + "step": 3260 + }, + { + "epoch": 0.8608529431151539, + "grad_norm": 0.2846696674823761, + "learning_rate": 0.00011397281963319699, + "loss": 0.0148, + "step": 3265 + }, + { + "epoch": 0.8621712477753609, + "grad_norm": 0.009311323054134846, + "learning_rate": 0.00011384087610502705, + "loss": 0.0024, + "step": 3270 + }, + { + "epoch": 0.8634895524355679, + "grad_norm": 0.046277035027742386, + "learning_rate": 0.0001137089325768571, + "loss": 0.0274, + "step": 3275 + }, + { + "epoch": 0.8648078570957748, + "grad_norm": 0.006024620030075312, + "learning_rate": 0.00011357698904868716, + "loss": 0.0286, + "step": 3280 + }, + { + "epoch": 0.8661261617559818, + "grad_norm": 0.033578380942344666, + "learning_rate": 0.00011344504552051723, + "loss": 0.0153, + "step": 3285 + }, + { + "epoch": 0.8674444664161888, + "grad_norm": 0.8537917137145996, + "learning_rate": 0.00011331310199234728, + "loss": 0.0304, + "step": 3290 + }, + { + "epoch": 0.8687627710763958, + "grad_norm": 0.013933337293565273, + "learning_rate": 0.00011318115846417734, + "loss": 0.0112, + "step": 3295 + }, + { + "epoch": 0.8700810757366028, + "grad_norm": 0.35437721014022827, + "learning_rate": 0.0001130492149360074, + "loss": 0.0228, + "step": 3300 + }, + { + "epoch": 0.8713993803968098, + "grad_norm": 1.3024121522903442, + "learning_rate": 0.00011291727140783745, + "loss": 0.0203, + "step": 3305 + }, + { + "epoch": 0.8727176850570166, + "grad_norm": 0.5131255984306335, + "learning_rate": 0.00011278532787966751, + "loss": 0.0181, + "step": 3310 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.039366886019706726, + "learning_rate": 0.00011265338435149758, + "loss": 0.0192, + "step": 3315 + }, + { + "epoch": 0.8753542943774306, + "grad_norm": 0.13679669797420502, + "learning_rate": 0.00011252144082332763, + "loss": 0.004, + "step": 3320 + }, + { + "epoch": 0.8766725990376376, + "grad_norm": 0.003076886525377631, + "learning_rate": 0.00011238949729515769, + "loss": 0.0405, + "step": 3325 + }, + { + "epoch": 0.8779909036978446, + "grad_norm": 0.019953785464167595, + "learning_rate": 0.00011225755376698774, + "loss": 0.0241, + "step": 3330 + }, + { + "epoch": 0.8793092083580516, + "grad_norm": 0.007980377413332462, + "learning_rate": 0.0001121256102388178, + "loss": 0.0064, + "step": 3335 + }, + { + "epoch": 0.8806275130182585, + "grad_norm": 0.018761295825242996, + "learning_rate": 0.00011199366671064784, + "loss": 
0.0032, + "step": 3340 + }, + { + "epoch": 0.8819458176784655, + "grad_norm": 0.022511709481477737, + "learning_rate": 0.0001118617231824779, + "loss": 0.0055, + "step": 3345 + }, + { + "epoch": 0.8832641223386725, + "grad_norm": 0.021270718425512314, + "learning_rate": 0.00011172977965430795, + "loss": 0.033, + "step": 3350 + }, + { + "epoch": 0.8845824269988795, + "grad_norm": 0.02710561640560627, + "learning_rate": 0.00011159783612613801, + "loss": 0.0094, + "step": 3355 + }, + { + "epoch": 0.8859007316590864, + "grad_norm": 0.4353378117084503, + "learning_rate": 0.00011146589259796806, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.8872190363192934, + "grad_norm": 0.0257766991853714, + "learning_rate": 0.00011133394906979813, + "loss": 0.0059, + "step": 3365 + }, + { + "epoch": 0.8885373409795003, + "grad_norm": 0.80838942527771, + "learning_rate": 0.00011120200554162819, + "loss": 0.0263, + "step": 3370 + }, + { + "epoch": 0.8898556456397073, + "grad_norm": 0.007799761835485697, + "learning_rate": 0.00011107006201345824, + "loss": 0.0028, + "step": 3375 + }, + { + "epoch": 0.8911739502999143, + "grad_norm": 0.007315775845199823, + "learning_rate": 0.0001109381184852883, + "loss": 0.0127, + "step": 3380 + }, + { + "epoch": 0.8924922549601213, + "grad_norm": 1.4861233234405518, + "learning_rate": 0.00011080617495711836, + "loss": 0.0562, + "step": 3385 + }, + { + "epoch": 0.8938105596203283, + "grad_norm": 0.010219530202448368, + "learning_rate": 0.00011067423142894841, + "loss": 0.0438, + "step": 3390 + }, + { + "epoch": 0.8951288642805353, + "grad_norm": 1.0191857814788818, + "learning_rate": 0.00011054228790077848, + "loss": 0.0493, + "step": 3395 + }, + { + "epoch": 0.8964471689407422, + "grad_norm": 0.01459536887705326, + "learning_rate": 0.00011041034437260854, + "loss": 0.0117, + "step": 3400 + }, + { + "epoch": 0.8977654736009492, + "grad_norm": 0.008682495914399624, + "learning_rate": 0.00011027840084443859, + "loss": 0.02, + "step": 3405 + }, + { + "epoch": 0.8990837782611562, + "grad_norm": 0.02197263017296791, + "learning_rate": 0.00011014645731626865, + "loss": 0.0454, + "step": 3410 + }, + { + "epoch": 0.9004020829213631, + "grad_norm": 0.01436714269220829, + "learning_rate": 0.0001100145137880987, + "loss": 0.0283, + "step": 3415 + }, + { + "epoch": 0.9017203875815701, + "grad_norm": 0.14327946305274963, + "learning_rate": 0.00010988257025992876, + "loss": 0.0461, + "step": 3420 + }, + { + "epoch": 0.9030386922417771, + "grad_norm": 1.671773910522461, + "learning_rate": 0.00010975062673175883, + "loss": 0.054, + "step": 3425 + }, + { + "epoch": 0.904356996901984, + "grad_norm": 0.009926804341375828, + "learning_rate": 0.00010961868320358888, + "loss": 0.0429, + "step": 3430 + }, + { + "epoch": 0.905675301562191, + "grad_norm": 0.554020881652832, + "learning_rate": 0.00010948673967541894, + "loss": 0.0618, + "step": 3435 + }, + { + "epoch": 0.906993606222398, + "grad_norm": 0.1399248093366623, + "learning_rate": 0.00010935479614724897, + "loss": 0.0229, + "step": 3440 + }, + { + "epoch": 0.908311910882605, + "grad_norm": 0.02739197015762329, + "learning_rate": 0.00010922285261907904, + "loss": 0.0082, + "step": 3445 + }, + { + "epoch": 0.909630215542812, + "grad_norm": 0.33394527435302734, + "learning_rate": 0.00010909090909090909, + "loss": 0.0403, + "step": 3450 + }, + { + "epoch": 0.9109485202030189, + "grad_norm": 0.08083894103765488, + "learning_rate": 0.00010895896556273915, + "loss": 0.0406, + "step": 3455 + }, + { + "epoch": 0.9122668248632259, + 
"grad_norm": 0.39336663484573364, + "learning_rate": 0.0001088270220345692, + "loss": 0.02, + "step": 3460 + }, + { + "epoch": 0.9135851295234328, + "grad_norm": 0.20481553673744202, + "learning_rate": 0.00010869507850639926, + "loss": 0.0221, + "step": 3465 + }, + { + "epoch": 0.9149034341836398, + "grad_norm": 1.4507408142089844, + "learning_rate": 0.00010856313497822932, + "loss": 0.0357, + "step": 3470 + }, + { + "epoch": 0.9162217388438468, + "grad_norm": 0.2678806483745575, + "learning_rate": 0.00010843119145005937, + "loss": 0.0181, + "step": 3475 + }, + { + "epoch": 0.9175400435040538, + "grad_norm": 0.007361674215644598, + "learning_rate": 0.00010829924792188944, + "loss": 0.0978, + "step": 3480 + }, + { + "epoch": 0.9188583481642607, + "grad_norm": 0.773695707321167, + "learning_rate": 0.0001081673043937195, + "loss": 0.0401, + "step": 3485 + }, + { + "epoch": 0.9201766528244677, + "grad_norm": 0.0010772625682875514, + "learning_rate": 0.00010803536086554955, + "loss": 0.0233, + "step": 3490 + }, + { + "epoch": 0.9214949574846747, + "grad_norm": 0.08971104770898819, + "learning_rate": 0.00010790341733737961, + "loss": 0.0319, + "step": 3495 + }, + { + "epoch": 0.9228132621448817, + "grad_norm": 0.21372731029987335, + "learning_rate": 0.00010777147380920966, + "loss": 0.0315, + "step": 3500 + }, + { + "epoch": 0.9228132621448817, + "eval_loss": 0.02952708676457405, + "eval_runtime": 451.5837, + "eval_samples_per_second": 7.467, + "eval_steps_per_second": 3.734, + "step": 3500 + }, + { + "epoch": 0.9241315668050887, + "grad_norm": 0.016639264300465584, + "learning_rate": 0.00010763953028103972, + "loss": 0.0125, + "step": 3505 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.46340492367744446, + "learning_rate": 0.00010750758675286979, + "loss": 0.0186, + "step": 3510 + }, + { + "epoch": 0.9267681761255026, + "grad_norm": 0.01847526989877224, + "learning_rate": 0.00010737564322469984, + "loss": 0.0026, + "step": 3515 + }, + { + "epoch": 0.9280864807857095, + "grad_norm": 0.5947860479354858, + "learning_rate": 0.0001072436996965299, + "loss": 0.0259, + "step": 3520 + }, + { + "epoch": 0.9294047854459165, + "grad_norm": 0.06145291402935982, + "learning_rate": 0.00010711175616835995, + "loss": 0.0057, + "step": 3525 + }, + { + "epoch": 0.9307230901061235, + "grad_norm": 0.0143959429115057, + "learning_rate": 0.00010697981264019001, + "loss": 0.0145, + "step": 3530 + }, + { + "epoch": 0.9320413947663305, + "grad_norm": 0.21143831312656403, + "learning_rate": 0.00010684786911202007, + "loss": 0.0459, + "step": 3535 + }, + { + "epoch": 0.9333596994265375, + "grad_norm": 0.02548077143728733, + "learning_rate": 0.00010671592558385011, + "loss": 0.0051, + "step": 3540 + }, + { + "epoch": 0.9346780040867444, + "grad_norm": 0.008077048696577549, + "learning_rate": 0.00010658398205568016, + "loss": 0.0306, + "step": 3545 + }, + { + "epoch": 0.9359963087469514, + "grad_norm": 0.0030760422814637423, + "learning_rate": 0.00010645203852751022, + "loss": 0.0575, + "step": 3550 + }, + { + "epoch": 0.9373146134071584, + "grad_norm": 0.18114158511161804, + "learning_rate": 0.00010632009499934027, + "loss": 0.0885, + "step": 3555 + }, + { + "epoch": 0.9386329180673654, + "grad_norm": 0.02450549602508545, + "learning_rate": 0.00010618815147117034, + "loss": 0.0045, + "step": 3560 + }, + { + "epoch": 0.9399512227275724, + "grad_norm": 0.1238626018166542, + "learning_rate": 0.0001060562079430004, + "loss": 0.0166, + "step": 3565 + }, + { + "epoch": 0.9412695273877794, + "grad_norm": 
0.1879919469356537, + "learning_rate": 0.00010592426441483046, + "loss": 0.0077, + "step": 3570 + }, + { + "epoch": 0.9425878320479862, + "grad_norm": 0.11323565989732742, + "learning_rate": 0.00010579232088666051, + "loss": 0.0213, + "step": 3575 + }, + { + "epoch": 0.9439061367081932, + "grad_norm": 0.35575854778289795, + "learning_rate": 0.00010566037735849057, + "loss": 0.0336, + "step": 3580 + }, + { + "epoch": 0.9452244413684002, + "grad_norm": 0.14052227139472961, + "learning_rate": 0.00010552843383032062, + "loss": 0.0325, + "step": 3585 + }, + { + "epoch": 0.9465427460286072, + "grad_norm": 0.2643798887729645, + "learning_rate": 0.00010539649030215069, + "loss": 0.0192, + "step": 3590 + }, + { + "epoch": 0.9478610506888142, + "grad_norm": 0.3207031190395355, + "learning_rate": 0.00010526454677398075, + "loss": 0.0221, + "step": 3595 + }, + { + "epoch": 0.9491793553490212, + "grad_norm": 0.022803861647844315, + "learning_rate": 0.0001051326032458108, + "loss": 0.029, + "step": 3600 + }, + { + "epoch": 0.9504976600092281, + "grad_norm": 0.02511664852499962, + "learning_rate": 0.00010500065971764086, + "loss": 0.0422, + "step": 3605 + }, + { + "epoch": 0.9518159646694351, + "grad_norm": 0.06505445390939713, + "learning_rate": 0.00010486871618947091, + "loss": 0.0092, + "step": 3610 + }, + { + "epoch": 0.9531342693296421, + "grad_norm": 0.09998584538698196, + "learning_rate": 0.00010473677266130097, + "loss": 0.0242, + "step": 3615 + }, + { + "epoch": 0.9544525739898491, + "grad_norm": 0.9645698666572571, + "learning_rate": 0.00010460482913313104, + "loss": 0.0124, + "step": 3620 + }, + { + "epoch": 0.955770878650056, + "grad_norm": 0.2389964610338211, + "learning_rate": 0.0001044728856049611, + "loss": 0.0169, + "step": 3625 + }, + { + "epoch": 0.957089183310263, + "grad_norm": 2.030608654022217, + "learning_rate": 0.00010434094207679115, + "loss": 0.0518, + "step": 3630 + }, + { + "epoch": 0.9584074879704699, + "grad_norm": 0.05979987606406212, + "learning_rate": 0.0001042089985486212, + "loss": 0.0081, + "step": 3635 + }, + { + "epoch": 0.9597257926306769, + "grad_norm": 0.15761719644069672, + "learning_rate": 0.00010407705502045125, + "loss": 0.0061, + "step": 3640 + }, + { + "epoch": 0.9610440972908839, + "grad_norm": 0.6534290909767151, + "learning_rate": 0.0001039451114922813, + "loss": 0.0104, + "step": 3645 + }, + { + "epoch": 0.9623624019510909, + "grad_norm": 1.0324147939682007, + "learning_rate": 0.00010381316796411136, + "loss": 0.0381, + "step": 3650 + }, + { + "epoch": 0.9636807066112979, + "grad_norm": 0.002968872431665659, + "learning_rate": 0.00010368122443594142, + "loss": 0.0343, + "step": 3655 + }, + { + "epoch": 0.9649990112715049, + "grad_norm": 0.011243184097111225, + "learning_rate": 0.00010354928090777147, + "loss": 0.019, + "step": 3660 + }, + { + "epoch": 0.9663173159317118, + "grad_norm": 0.17663739621639252, + "learning_rate": 0.00010341733737960153, + "loss": 0.0452, + "step": 3665 + }, + { + "epoch": 0.9676356205919188, + "grad_norm": 1.2647719383239746, + "learning_rate": 0.00010328539385143158, + "loss": 0.0154, + "step": 3670 + }, + { + "epoch": 0.9689539252521258, + "grad_norm": 0.3691752552986145, + "learning_rate": 0.00010315345032326165, + "loss": 0.028, + "step": 3675 + }, + { + "epoch": 0.9702722299123328, + "grad_norm": 0.0015879774000495672, + "learning_rate": 0.00010302150679509171, + "loss": 0.0202, + "step": 3680 + }, + { + "epoch": 0.9715905345725397, + "grad_norm": 0.1441984623670578, + "learning_rate": 0.00010288956326692176, + "loss": 
0.0221, + "step": 3685 + }, + { + "epoch": 0.9729088392327467, + "grad_norm": 0.20431455969810486, + "learning_rate": 0.00010275761973875182, + "loss": 0.0072, + "step": 3690 + }, + { + "epoch": 0.9742271438929536, + "grad_norm": 0.861625611782074, + "learning_rate": 0.00010262567621058187, + "loss": 0.0523, + "step": 3695 + }, + { + "epoch": 0.9755454485531606, + "grad_norm": 0.005049478262662888, + "learning_rate": 0.00010249373268241193, + "loss": 0.0051, + "step": 3700 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.49685510993003845, + "learning_rate": 0.000102361789154242, + "loss": 0.023, + "step": 3705 + }, + { + "epoch": 0.9781820578735746, + "grad_norm": 0.08789395540952682, + "learning_rate": 0.00010222984562607205, + "loss": 0.0159, + "step": 3710 + }, + { + "epoch": 0.9795003625337816, + "grad_norm": 0.027168691158294678, + "learning_rate": 0.00010209790209790211, + "loss": 0.0083, + "step": 3715 + }, + { + "epoch": 0.9808186671939886, + "grad_norm": 0.0006773864733986557, + "learning_rate": 0.00010196595856973217, + "loss": 0.0048, + "step": 3720 + }, + { + "epoch": 0.9821369718541955, + "grad_norm": 0.01636457070708275, + "learning_rate": 0.00010183401504156222, + "loss": 0.0159, + "step": 3725 + }, + { + "epoch": 0.9834552765144025, + "grad_norm": 0.10160859674215317, + "learning_rate": 0.00010170207151339228, + "loss": 0.0047, + "step": 3730 + }, + { + "epoch": 0.9847735811746094, + "grad_norm": 0.14173269271850586, + "learning_rate": 0.00010157012798522232, + "loss": 0.006, + "step": 3735 + }, + { + "epoch": 0.9860918858348164, + "grad_norm": 0.003458512481302023, + "learning_rate": 0.00010143818445705238, + "loss": 0.0193, + "step": 3740 + }, + { + "epoch": 0.9874101904950234, + "grad_norm": 0.005163820460438728, + "learning_rate": 0.00010130624092888243, + "loss": 0.0039, + "step": 3745 + }, + { + "epoch": 0.9887284951552304, + "grad_norm": 0.005913791712373495, + "learning_rate": 0.00010117429740071249, + "loss": 0.0119, + "step": 3750 + }, + { + "epoch": 0.9900467998154373, + "grad_norm": 0.00800853967666626, + "learning_rate": 0.00010104235387254256, + "loss": 0.044, + "step": 3755 + }, + { + "epoch": 0.9913651044756443, + "grad_norm": 0.18146778643131256, + "learning_rate": 0.00010091041034437261, + "loss": 0.0048, + "step": 3760 + }, + { + "epoch": 0.9926834091358513, + "grad_norm": 0.01235104724764824, + "learning_rate": 0.00010077846681620267, + "loss": 0.0017, + "step": 3765 + }, + { + "epoch": 0.9940017137960583, + "grad_norm": 0.17677897214889526, + "learning_rate": 0.00010064652328803272, + "loss": 0.0339, + "step": 3770 + }, + { + "epoch": 0.9953200184562653, + "grad_norm": 0.0017472271574661136, + "learning_rate": 0.00010051457975986278, + "loss": 0.0494, + "step": 3775 + }, + { + "epoch": 0.9966383231164723, + "grad_norm": 0.10814860463142395, + "learning_rate": 0.00010038263623169283, + "loss": 0.0741, + "step": 3780 + }, + { + "epoch": 0.9979566277766792, + "grad_norm": 0.11329760402441025, + "learning_rate": 0.0001002506927035229, + "loss": 0.0182, + "step": 3785 + }, + { + "epoch": 0.9992749324368861, + "grad_norm": 0.11573276668787003, + "learning_rate": 0.00010011874917535296, + "loss": 0.0068, + "step": 3790 + }, + { + "epoch": 1.000790982796124, + "grad_norm": 0.08449886739253998, + "learning_rate": 9.998680564718301e-05, + "loss": 0.0141, + "step": 3795 + }, + { + "epoch": 1.002109287456331, + "grad_norm": 0.05035184696316719, + "learning_rate": 9.985486211901307e-05, + "loss": 0.0293, + "step": 3800 + }, + { + "epoch": 1.003427592116538, 
+ "grad_norm": 0.0255444198846817, + "learning_rate": 9.972291859084313e-05, + "loss": 0.0054, + "step": 3805 + }, + { + "epoch": 1.004745896776745, + "grad_norm": 0.0033677336759865284, + "learning_rate": 9.959097506267318e-05, + "loss": 0.0567, + "step": 3810 + }, + { + "epoch": 1.006064201436952, + "grad_norm": 0.09453682601451874, + "learning_rate": 9.945903153450324e-05, + "loss": 0.0589, + "step": 3815 + }, + { + "epoch": 1.007382506097159, + "grad_norm": 0.01592979207634926, + "learning_rate": 9.932708800633329e-05, + "loss": 0.0043, + "step": 3820 + }, + { + "epoch": 1.008700810757366, + "grad_norm": 0.002263693604618311, + "learning_rate": 9.919514447816335e-05, + "loss": 0.0195, + "step": 3825 + }, + { + "epoch": 1.010019115417573, + "grad_norm": 0.013390793465077877, + "learning_rate": 9.90632009499934e-05, + "loss": 0.0152, + "step": 3830 + }, + { + "epoch": 1.01133742007778, + "grad_norm": 0.10473847389221191, + "learning_rate": 9.893125742182346e-05, + "loss": 0.0606, + "step": 3835 + }, + { + "epoch": 1.012655724737987, + "grad_norm": 0.05837221071124077, + "learning_rate": 9.879931389365353e-05, + "loss": 0.0121, + "step": 3840 + }, + { + "epoch": 1.013974029398194, + "grad_norm": 0.3803791105747223, + "learning_rate": 9.866737036548358e-05, + "loss": 0.0386, + "step": 3845 + }, + { + "epoch": 1.0152923340584008, + "grad_norm": 0.4067519009113312, + "learning_rate": 9.853542683731364e-05, + "loss": 0.0115, + "step": 3850 + }, + { + "epoch": 1.0166106387186078, + "grad_norm": 0.02585229091346264, + "learning_rate": 9.84034833091437e-05, + "loss": 0.0214, + "step": 3855 + }, + { + "epoch": 1.0179289433788148, + "grad_norm": 0.03670825809240341, + "learning_rate": 9.827153978097374e-05, + "loss": 0.0059, + "step": 3860 + }, + { + "epoch": 1.0192472480390218, + "grad_norm": 0.014171554706990719, + "learning_rate": 9.81395962528038e-05, + "loss": 0.0145, + "step": 3865 + }, + { + "epoch": 1.0205655526992288, + "grad_norm": 0.027376385405659676, + "learning_rate": 9.800765272463386e-05, + "loss": 0.0089, + "step": 3870 + }, + { + "epoch": 1.0218838573594358, + "grad_norm": 0.03168405964970589, + "learning_rate": 9.787570919646392e-05, + "loss": 0.0132, + "step": 3875 + }, + { + "epoch": 1.0232021620196428, + "grad_norm": 0.03346199914813042, + "learning_rate": 9.774376566829397e-05, + "loss": 0.0246, + "step": 3880 + }, + { + "epoch": 1.0245204666798498, + "grad_norm": 0.00894144270569086, + "learning_rate": 9.761182214012403e-05, + "loss": 0.0105, + "step": 3885 + }, + { + "epoch": 1.0258387713400567, + "grad_norm": 0.3172806203365326, + "learning_rate": 9.747987861195409e-05, + "loss": 0.0103, + "step": 3890 + }, + { + "epoch": 1.0271570760002637, + "grad_norm": 0.009055040776729584, + "learning_rate": 9.734793508378414e-05, + "loss": 0.0103, + "step": 3895 + }, + { + "epoch": 1.0284753806604707, + "grad_norm": 0.014140011742711067, + "learning_rate": 9.721599155561421e-05, + "loss": 0.0037, + "step": 3900 + }, + { + "epoch": 1.0297936853206777, + "grad_norm": 0.008317383006215096, + "learning_rate": 9.708404802744427e-05, + "loss": 0.002, + "step": 3905 + }, + { + "epoch": 1.0311119899808845, + "grad_norm": 0.005038558971136808, + "learning_rate": 9.695210449927431e-05, + "loss": 0.0017, + "step": 3910 + }, + { + "epoch": 1.0324302946410915, + "grad_norm": 0.40058520436286926, + "learning_rate": 9.682016097110436e-05, + "loss": 0.0065, + "step": 3915 + }, + { + "epoch": 1.0337485993012985, + "grad_norm": 0.005197151098400354, + "learning_rate": 9.668821744293442e-05, + "loss": 
0.0031, + "step": 3920 + }, + { + "epoch": 1.0350669039615055, + "grad_norm": 0.014353781007230282, + "learning_rate": 9.655627391476449e-05, + "loss": 0.0009, + "step": 3925 + }, + { + "epoch": 1.0363852086217125, + "grad_norm": 0.13260559737682343, + "learning_rate": 9.642433038659454e-05, + "loss": 0.0323, + "step": 3930 + }, + { + "epoch": 1.0377035132819195, + "grad_norm": 0.006795065477490425, + "learning_rate": 9.62923868584246e-05, + "loss": 0.0022, + "step": 3935 + }, + { + "epoch": 1.0390218179421264, + "grad_norm": 0.2276086062192917, + "learning_rate": 9.616044333025466e-05, + "loss": 0.0221, + "step": 3940 + }, + { + "epoch": 1.0403401226023334, + "grad_norm": 0.06121920794248581, + "learning_rate": 9.602849980208471e-05, + "loss": 0.0037, + "step": 3945 + }, + { + "epoch": 1.0416584272625404, + "grad_norm": 0.9180755019187927, + "learning_rate": 9.589655627391477e-05, + "loss": 0.0589, + "step": 3950 + }, + { + "epoch": 1.0429767319227474, + "grad_norm": 0.07515591382980347, + "learning_rate": 9.576461274574484e-05, + "loss": 0.0653, + "step": 3955 + }, + { + "epoch": 1.0442950365829544, + "grad_norm": 0.018060607835650444, + "learning_rate": 9.563266921757488e-05, + "loss": 0.0178, + "step": 3960 + }, + { + "epoch": 1.0456133412431612, + "grad_norm": 0.02751368284225464, + "learning_rate": 9.550072568940493e-05, + "loss": 0.0076, + "step": 3965 + }, + { + "epoch": 1.0469316459033682, + "grad_norm": 0.653998613357544, + "learning_rate": 9.536878216123499e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 1.0482499505635752, + "grad_norm": 0.3117768168449402, + "learning_rate": 9.523683863306505e-05, + "loss": 0.0087, + "step": 3975 + }, + { + "epoch": 1.0495682552237822, + "grad_norm": 0.013952831737697124, + "learning_rate": 9.510489510489511e-05, + "loss": 0.0037, + "step": 3980 + }, + { + "epoch": 1.0508865598839892, + "grad_norm": 0.01806250400841236, + "learning_rate": 9.497295157672517e-05, + "loss": 0.0028, + "step": 3985 + }, + { + "epoch": 1.0522048645441962, + "grad_norm": 0.13678006827831268, + "learning_rate": 9.484100804855523e-05, + "loss": 0.0533, + "step": 3990 + }, + { + "epoch": 1.0535231692044031, + "grad_norm": 0.14869382977485657, + "learning_rate": 9.470906452038528e-05, + "loss": 0.009, + "step": 3995 + }, + { + "epoch": 1.0548414738646101, + "grad_norm": 0.33614659309387207, + "learning_rate": 9.457712099221534e-05, + "loss": 0.0555, + "step": 4000 + }, + { + "epoch": 1.0548414738646101, + "eval_loss": 0.026165226474404335, + "eval_runtime": 452.2482, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 4000 + }, + { + "epoch": 1.0561597785248171, + "grad_norm": 0.007546027656644583, + "learning_rate": 9.444517746404539e-05, + "loss": 0.0029, + "step": 4005 + }, + { + "epoch": 1.0574780831850241, + "grad_norm": 0.3720332384109497, + "learning_rate": 9.431323393587545e-05, + "loss": 0.0353, + "step": 4010 + }, + { + "epoch": 1.0587963878452311, + "grad_norm": 1.1335264444351196, + "learning_rate": 9.41812904077055e-05, + "loss": 0.0142, + "step": 4015 + }, + { + "epoch": 1.060114692505438, + "grad_norm": 0.024723488837480545, + "learning_rate": 9.404934687953556e-05, + "loss": 0.006, + "step": 4020 + }, + { + "epoch": 1.0614329971656449, + "grad_norm": 0.040354058146476746, + "learning_rate": 9.391740335136562e-05, + "loss": 0.0107, + "step": 4025 + }, + { + "epoch": 1.0627513018258519, + "grad_norm": 0.222810298204422, + "learning_rate": 9.378545982319567e-05, + "loss": 0.0273, + "step": 4030 + }, + { + "epoch": 
1.0640696064860589, + "grad_norm": 0.025684095919132233, + "learning_rate": 9.365351629502574e-05, + "loss": 0.0033, + "step": 4035 + }, + { + "epoch": 1.0653879111462659, + "grad_norm": 0.05338352546095848, + "learning_rate": 9.35215727668558e-05, + "loss": 0.0052, + "step": 4040 + }, + { + "epoch": 1.0667062158064728, + "grad_norm": 0.06182330474257469, + "learning_rate": 9.338962923868585e-05, + "loss": 0.0038, + "step": 4045 + }, + { + "epoch": 1.0680245204666798, + "grad_norm": 0.012170832604169846, + "learning_rate": 9.325768571051591e-05, + "loss": 0.0018, + "step": 4050 + }, + { + "epoch": 1.0693428251268868, + "grad_norm": 0.5424306392669678, + "learning_rate": 9.312574218234596e-05, + "loss": 0.0445, + "step": 4055 + }, + { + "epoch": 1.0706611297870938, + "grad_norm": 0.017939254641532898, + "learning_rate": 9.299379865417602e-05, + "loss": 0.0389, + "step": 4060 + }, + { + "epoch": 1.0719794344473008, + "grad_norm": 0.0060431682504713535, + "learning_rate": 9.286185512600607e-05, + "loss": 0.0025, + "step": 4065 + }, + { + "epoch": 1.0732977391075078, + "grad_norm": 0.0071444883942604065, + "learning_rate": 9.272991159783613e-05, + "loss": 0.0333, + "step": 4070 + }, + { + "epoch": 1.0746160437677148, + "grad_norm": 0.29632750153541565, + "learning_rate": 9.259796806966619e-05, + "loss": 0.0151, + "step": 4075 + }, + { + "epoch": 1.0759343484279218, + "grad_norm": 0.004526323173195124, + "learning_rate": 9.246602454149624e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 1.0772526530881286, + "grad_norm": 0.023945212364196777, + "learning_rate": 9.23340810133263e-05, + "loss": 0.004, + "step": 4085 + }, + { + "epoch": 1.0785709577483356, + "grad_norm": 0.13235126435756683, + "learning_rate": 9.220213748515635e-05, + "loss": 0.0059, + "step": 4090 + }, + { + "epoch": 1.0798892624085425, + "grad_norm": 0.17592330276966095, + "learning_rate": 9.207019395698642e-05, + "loss": 0.0302, + "step": 4095 + }, + { + "epoch": 1.0812075670687495, + "grad_norm": 0.004582866560667753, + "learning_rate": 9.193825042881648e-05, + "loss": 0.009, + "step": 4100 + }, + { + "epoch": 1.0825258717289565, + "grad_norm": 0.15214525163173676, + "learning_rate": 9.180630690064653e-05, + "loss": 0.0062, + "step": 4105 + }, + { + "epoch": 1.0838441763891635, + "grad_norm": 0.16535983979701996, + "learning_rate": 9.167436337247658e-05, + "loss": 0.0926, + "step": 4110 + }, + { + "epoch": 1.0851624810493705, + "grad_norm": 0.013285227119922638, + "learning_rate": 9.154241984430663e-05, + "loss": 0.0043, + "step": 4115 + }, + { + "epoch": 1.0864807857095775, + "grad_norm": 0.012116984464228153, + "learning_rate": 9.14104763161367e-05, + "loss": 0.0037, + "step": 4120 + }, + { + "epoch": 1.0877990903697845, + "grad_norm": 0.0373845212161541, + "learning_rate": 9.127853278796676e-05, + "loss": 0.0081, + "step": 4125 + }, + { + "epoch": 1.0891173950299915, + "grad_norm": 0.09324615448713303, + "learning_rate": 9.114658925979681e-05, + "loss": 0.0534, + "step": 4130 + }, + { + "epoch": 1.0904356996901985, + "grad_norm": 0.010992968454957008, + "learning_rate": 9.101464573162687e-05, + "loss": 0.0025, + "step": 4135 + }, + { + "epoch": 1.0917540043504055, + "grad_norm": 0.13710318505764008, + "learning_rate": 9.088270220345692e-05, + "loss": 0.0555, + "step": 4140 + }, + { + "epoch": 1.0930723090106123, + "grad_norm": 0.010403074324131012, + "learning_rate": 9.075075867528698e-05, + "loss": 0.0042, + "step": 4145 + }, + { + "epoch": 1.0943906136708192, + "grad_norm": 0.21544460952281952, + "learning_rate": 
9.061881514711705e-05, + "loss": 0.0144, + "step": 4150 + }, + { + "epoch": 1.0957089183310262, + "grad_norm": 0.04194799065589905, + "learning_rate": 9.04868716189471e-05, + "loss": 0.0106, + "step": 4155 + }, + { + "epoch": 1.0970272229912332, + "grad_norm": 0.029204202815890312, + "learning_rate": 9.035492809077715e-05, + "loss": 0.0085, + "step": 4160 + }, + { + "epoch": 1.0983455276514402, + "grad_norm": 0.006751026958227158, + "learning_rate": 9.02229845626072e-05, + "loss": 0.0049, + "step": 4165 + }, + { + "epoch": 1.0996638323116472, + "grad_norm": 0.008232722990214825, + "learning_rate": 9.009104103443726e-05, + "loss": 0.0172, + "step": 4170 + }, + { + "epoch": 1.1009821369718542, + "grad_norm": 0.05630079656839371, + "learning_rate": 8.995909750626733e-05, + "loss": 0.0112, + "step": 4175 + }, + { + "epoch": 1.1023004416320612, + "grad_norm": 0.0011601662263274193, + "learning_rate": 8.982715397809738e-05, + "loss": 0.0317, + "step": 4180 + }, + { + "epoch": 1.1036187462922682, + "grad_norm": 0.006554402410984039, + "learning_rate": 8.969521044992744e-05, + "loss": 0.0035, + "step": 4185 + }, + { + "epoch": 1.1049370509524752, + "grad_norm": 0.34513652324676514, + "learning_rate": 8.956326692175749e-05, + "loss": 0.0036, + "step": 4190 + }, + { + "epoch": 1.1062553556126822, + "grad_norm": 0.283669650554657, + "learning_rate": 8.943132339358755e-05, + "loss": 0.0182, + "step": 4195 + }, + { + "epoch": 1.1075736602728892, + "grad_norm": 0.5376952290534973, + "learning_rate": 8.92993798654176e-05, + "loss": 0.0293, + "step": 4200 + }, + { + "epoch": 1.108891964933096, + "grad_norm": 0.01689724065363407, + "learning_rate": 8.916743633724767e-05, + "loss": 0.0206, + "step": 4205 + }, + { + "epoch": 1.110210269593303, + "grad_norm": 0.026538770645856857, + "learning_rate": 8.903549280907772e-05, + "loss": 0.0181, + "step": 4210 + }, + { + "epoch": 1.11152857425351, + "grad_norm": 0.6372873783111572, + "learning_rate": 8.890354928090777e-05, + "loss": 0.021, + "step": 4215 + }, + { + "epoch": 1.112846878913717, + "grad_norm": 0.06177428737282753, + "learning_rate": 8.877160575273783e-05, + "loss": 0.0033, + "step": 4220 + }, + { + "epoch": 1.114165183573924, + "grad_norm": 0.3712109923362732, + "learning_rate": 8.863966222456788e-05, + "loss": 0.0075, + "step": 4225 + }, + { + "epoch": 1.115483488234131, + "grad_norm": 0.030514653772115707, + "learning_rate": 8.850771869639795e-05, + "loss": 0.0183, + "step": 4230 + }, + { + "epoch": 1.116801792894338, + "grad_norm": 0.012861707247793674, + "learning_rate": 8.837577516822801e-05, + "loss": 0.0032, + "step": 4235 + }, + { + "epoch": 1.118120097554545, + "grad_norm": 0.3278522789478302, + "learning_rate": 8.824383164005806e-05, + "loss": 0.0058, + "step": 4240 + }, + { + "epoch": 1.1194384022147519, + "grad_norm": 0.580259382724762, + "learning_rate": 8.811188811188812e-05, + "loss": 0.0068, + "step": 4245 + }, + { + "epoch": 1.1207567068749589, + "grad_norm": 0.007002575788646936, + "learning_rate": 8.797994458371817e-05, + "loss": 0.0063, + "step": 4250 + }, + { + "epoch": 1.1220750115351659, + "grad_norm": 0.22484643757343292, + "learning_rate": 8.784800105554823e-05, + "loss": 0.0167, + "step": 4255 + }, + { + "epoch": 1.1233933161953726, + "grad_norm": 0.004122686106711626, + "learning_rate": 8.771605752737829e-05, + "loss": 0.002, + "step": 4260 + }, + { + "epoch": 1.1247116208555796, + "grad_norm": 0.009832561016082764, + "learning_rate": 8.758411399920834e-05, + "loss": 0.0029, + "step": 4265 + }, + { + "epoch": 
1.1260299255157866, + "grad_norm": 0.04854527860879898, + "learning_rate": 8.74521704710384e-05, + "loss": 0.0068, + "step": 4270 + }, + { + "epoch": 1.1273482301759936, + "grad_norm": 0.12221235036849976, + "learning_rate": 8.732022694286845e-05, + "loss": 0.003, + "step": 4275 + }, + { + "epoch": 1.1286665348362006, + "grad_norm": 0.005857539363205433, + "learning_rate": 8.718828341469851e-05, + "loss": 0.0022, + "step": 4280 + }, + { + "epoch": 1.1299848394964076, + "grad_norm": 0.10582758486270905, + "learning_rate": 8.705633988652856e-05, + "loss": 0.002, + "step": 4285 + }, + { + "epoch": 1.1313031441566146, + "grad_norm": 0.006190940272063017, + "learning_rate": 8.692439635835863e-05, + "loss": 0.0022, + "step": 4290 + }, + { + "epoch": 1.1326214488168216, + "grad_norm": 0.00221514655277133, + "learning_rate": 8.679245283018869e-05, + "loss": 0.0314, + "step": 4295 + }, + { + "epoch": 1.1339397534770286, + "grad_norm": 0.0796755850315094, + "learning_rate": 8.666050930201874e-05, + "loss": 0.0347, + "step": 4300 + }, + { + "epoch": 1.1352580581372356, + "grad_norm": 0.20088806748390198, + "learning_rate": 8.65285657738488e-05, + "loss": 0.0048, + "step": 4305 + }, + { + "epoch": 1.1365763627974426, + "grad_norm": 0.4018377363681793, + "learning_rate": 8.639662224567884e-05, + "loss": 0.0234, + "step": 4310 + }, + { + "epoch": 1.1378946674576496, + "grad_norm": 0.014961684122681618, + "learning_rate": 8.626467871750891e-05, + "loss": 0.0033, + "step": 4315 + }, + { + "epoch": 1.1392129721178565, + "grad_norm": 0.004534922540187836, + "learning_rate": 8.613273518933897e-05, + "loss": 0.0021, + "step": 4320 + }, + { + "epoch": 1.1405312767780633, + "grad_norm": 0.06340984255075455, + "learning_rate": 8.600079166116902e-05, + "loss": 0.0538, + "step": 4325 + }, + { + "epoch": 1.1418495814382703, + "grad_norm": 0.007374623324722052, + "learning_rate": 8.586884813299908e-05, + "loss": 0.0157, + "step": 4330 + }, + { + "epoch": 1.1431678860984773, + "grad_norm": 0.02313193492591381, + "learning_rate": 8.573690460482913e-05, + "loss": 0.0307, + "step": 4335 + }, + { + "epoch": 1.1444861907586843, + "grad_norm": 0.014071634039282799, + "learning_rate": 8.560496107665919e-05, + "loss": 0.0058, + "step": 4340 + }, + { + "epoch": 1.1458044954188913, + "grad_norm": 1.4664901494979858, + "learning_rate": 8.547301754848926e-05, + "loss": 0.0566, + "step": 4345 + }, + { + "epoch": 1.1471228000790983, + "grad_norm": 0.023680074140429497, + "learning_rate": 8.534107402031931e-05, + "loss": 0.0048, + "step": 4350 + }, + { + "epoch": 1.1484411047393053, + "grad_norm": 0.012555698864161968, + "learning_rate": 8.520913049214937e-05, + "loss": 0.0076, + "step": 4355 + }, + { + "epoch": 1.1497594093995123, + "grad_norm": 0.013624129816889763, + "learning_rate": 8.507718696397941e-05, + "loss": 0.0373, + "step": 4360 + }, + { + "epoch": 1.1510777140597193, + "grad_norm": 0.015372387133538723, + "learning_rate": 8.494524343580947e-05, + "loss": 0.0147, + "step": 4365 + }, + { + "epoch": 1.1523960187199263, + "grad_norm": 0.3312993347644806, + "learning_rate": 8.481329990763954e-05, + "loss": 0.0299, + "step": 4370 + }, + { + "epoch": 1.1537143233801332, + "grad_norm": 0.023838184773921967, + "learning_rate": 8.468135637946959e-05, + "loss": 0.0226, + "step": 4375 + }, + { + "epoch": 1.15503262804034, + "grad_norm": 0.42516952753067017, + "learning_rate": 8.454941285129965e-05, + "loss": 0.0088, + "step": 4380 + }, + { + "epoch": 1.156350932700547, + "grad_norm": 0.6900278925895691, + "learning_rate": 
8.44174693231297e-05, + "loss": 0.0245, + "step": 4385 + }, + { + "epoch": 1.157669237360754, + "grad_norm": 0.2932703197002411, + "learning_rate": 8.428552579495976e-05, + "loss": 0.0207, + "step": 4390 + }, + { + "epoch": 1.158987542020961, + "grad_norm": 0.12942780554294586, + "learning_rate": 8.415358226678982e-05, + "loss": 0.0037, + "step": 4395 + }, + { + "epoch": 1.160305846681168, + "grad_norm": 0.9499046802520752, + "learning_rate": 8.402163873861989e-05, + "loss": 0.0246, + "step": 4400 + }, + { + "epoch": 1.161624151341375, + "grad_norm": 0.008869118988513947, + "learning_rate": 8.388969521044994e-05, + "loss": 0.0171, + "step": 4405 + }, + { + "epoch": 1.162942456001582, + "grad_norm": 1.7409231662750244, + "learning_rate": 8.375775168227998e-05, + "loss": 0.017, + "step": 4410 + }, + { + "epoch": 1.164260760661789, + "grad_norm": 0.0020101398695260286, + "learning_rate": 8.362580815411004e-05, + "loss": 0.0027, + "step": 4415 + }, + { + "epoch": 1.165579065321996, + "grad_norm": 0.0785067081451416, + "learning_rate": 8.34938646259401e-05, + "loss": 0.0043, + "step": 4420 + }, + { + "epoch": 1.166897369982203, + "grad_norm": 0.0029506285209208727, + "learning_rate": 8.336192109777016e-05, + "loss": 0.0109, + "step": 4425 + }, + { + "epoch": 1.16821567464241, + "grad_norm": 0.02216683328151703, + "learning_rate": 8.322997756960022e-05, + "loss": 0.0026, + "step": 4430 + }, + { + "epoch": 1.1695339793026167, + "grad_norm": 0.02216639369726181, + "learning_rate": 8.309803404143027e-05, + "loss": 0.0045, + "step": 4435 + }, + { + "epoch": 1.170852283962824, + "grad_norm": 0.0, + "learning_rate": 8.296609051326033e-05, + "loss": 0.006, + "step": 4440 + }, + { + "epoch": 1.1721705886230307, + "grad_norm": 0.0019736960530281067, + "learning_rate": 8.283414698509039e-05, + "loss": 0.0078, + "step": 4445 + }, + { + "epoch": 1.1734888932832377, + "grad_norm": 0.012957746163010597, + "learning_rate": 8.270220345692044e-05, + "loss": 0.002, + "step": 4450 + }, + { + "epoch": 1.1748071979434447, + "grad_norm": 0.010877869091928005, + "learning_rate": 8.25702599287505e-05, + "loss": 0.0237, + "step": 4455 + }, + { + "epoch": 1.1761255026036517, + "grad_norm": 0.005947659723460674, + "learning_rate": 8.243831640058055e-05, + "loss": 0.0341, + "step": 4460 + }, + { + "epoch": 1.1774438072638587, + "grad_norm": 0.0005026470171287656, + "learning_rate": 8.230637287241061e-05, + "loss": 0.0033, + "step": 4465 + }, + { + "epoch": 1.1787621119240657, + "grad_norm": 0.022054588422179222, + "learning_rate": 8.217442934424066e-05, + "loss": 0.0042, + "step": 4470 + }, + { + "epoch": 1.1800804165842727, + "grad_norm": 0.7929030656814575, + "learning_rate": 8.204248581607072e-05, + "loss": 0.0076, + "step": 4475 + }, + { + "epoch": 1.1813987212444796, + "grad_norm": 0.39052629470825195, + "learning_rate": 8.191054228790078e-05, + "loss": 0.0228, + "step": 4480 + }, + { + "epoch": 1.1827170259046866, + "grad_norm": 0.007177622988820076, + "learning_rate": 8.177859875973084e-05, + "loss": 0.01, + "step": 4485 + }, + { + "epoch": 1.1840353305648936, + "grad_norm": 0.006175135262310505, + "learning_rate": 8.16466552315609e-05, + "loss": 0.0037, + "step": 4490 + }, + { + "epoch": 1.1853536352251006, + "grad_norm": 0.0356481671333313, + "learning_rate": 8.151471170339096e-05, + "loss": 0.0024, + "step": 4495 + }, + { + "epoch": 1.1866719398853074, + "grad_norm": 0.19069480895996094, + "learning_rate": 8.138276817522101e-05, + "loss": 0.0048, + "step": 4500 + }, + { + "epoch": 1.1866719398853074, + 
"eval_loss": 0.026386437937617302, + "eval_runtime": 452.2896, + "eval_samples_per_second": 7.455, + "eval_steps_per_second": 3.728, + "step": 4500 + }, + { + "epoch": 1.1879902445455144, + "grad_norm": 0.002254961524158716, + "learning_rate": 8.125082464705107e-05, + "loss": 0.0014, + "step": 4505 + }, + { + "epoch": 1.1893085492057214, + "grad_norm": 0.8026870489120483, + "learning_rate": 8.111888111888112e-05, + "loss": 0.0411, + "step": 4510 + }, + { + "epoch": 1.1906268538659284, + "grad_norm": 0.47328072786331177, + "learning_rate": 8.098693759071118e-05, + "loss": 0.0271, + "step": 4515 + }, + { + "epoch": 1.1919451585261354, + "grad_norm": 0.4888288676738739, + "learning_rate": 8.085499406254123e-05, + "loss": 0.039, + "step": 4520 + }, + { + "epoch": 1.1932634631863424, + "grad_norm": 0.000925812462810427, + "learning_rate": 8.072305053437129e-05, + "loss": 0.0461, + "step": 4525 + }, + { + "epoch": 1.1945817678465493, + "grad_norm": 0.12472371757030487, + "learning_rate": 8.059110700620135e-05, + "loss": 0.0037, + "step": 4530 + }, + { + "epoch": 1.1959000725067563, + "grad_norm": 0.002875336678698659, + "learning_rate": 8.04591634780314e-05, + "loss": 0.0425, + "step": 4535 + }, + { + "epoch": 1.1972183771669633, + "grad_norm": 0.042056187987327576, + "learning_rate": 8.032721994986147e-05, + "loss": 0.0068, + "step": 4540 + }, + { + "epoch": 1.1985366818271703, + "grad_norm": 0.157605841755867, + "learning_rate": 8.019527642169153e-05, + "loss": 0.0179, + "step": 4545 + }, + { + "epoch": 1.1998549864873773, + "grad_norm": 0.005153563339263201, + "learning_rate": 8.006333289352158e-05, + "loss": 0.0045, + "step": 4550 + }, + { + "epoch": 1.201173291147584, + "grad_norm": 0.02541598491370678, + "learning_rate": 7.993138936535164e-05, + "loss": 0.0041, + "step": 4555 + }, + { + "epoch": 1.2024915958077913, + "grad_norm": 0.04266195371747017, + "learning_rate": 7.979944583718168e-05, + "loss": 0.0121, + "step": 4560 + }, + { + "epoch": 1.203809900467998, + "grad_norm": 0.36108532547950745, + "learning_rate": 7.966750230901175e-05, + "loss": 0.0147, + "step": 4565 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.40405452251434326, + "learning_rate": 7.95355587808418e-05, + "loss": 0.0056, + "step": 4570 + }, + { + "epoch": 1.206446509788412, + "grad_norm": 0.030422702431678772, + "learning_rate": 7.940361525267186e-05, + "loss": 0.0055, + "step": 4575 + }, + { + "epoch": 1.207764814448619, + "grad_norm": 0.014555396512150764, + "learning_rate": 7.927167172450192e-05, + "loss": 0.0029, + "step": 4580 + }, + { + "epoch": 1.209083119108826, + "grad_norm": 0.33962950110435486, + "learning_rate": 7.913972819633197e-05, + "loss": 0.0191, + "step": 4585 + }, + { + "epoch": 1.210401423769033, + "grad_norm": 0.040150560438632965, + "learning_rate": 7.900778466816203e-05, + "loss": 0.0096, + "step": 4590 + }, + { + "epoch": 1.21171972842924, + "grad_norm": 0.2968510091304779, + "learning_rate": 7.88758411399921e-05, + "loss": 0.0311, + "step": 4595 + }, + { + "epoch": 1.213038033089447, + "grad_norm": 0.04709814116358757, + "learning_rate": 7.874389761182215e-05, + "loss": 0.0175, + "step": 4600 + }, + { + "epoch": 1.214356337749654, + "grad_norm": 0.1379537284374237, + "learning_rate": 7.861195408365221e-05, + "loss": 0.02, + "step": 4605 + }, + { + "epoch": 1.215674642409861, + "grad_norm": 0.018291711807250977, + "learning_rate": 7.848001055548225e-05, + "loss": 0.003, + "step": 4610 + }, + { + "epoch": 1.216992947070068, + "grad_norm": 0.041676126420497894, + "learning_rate": 
7.83480670273123e-05, + "loss": 0.0054, + "step": 4615 + }, + { + "epoch": 1.2183112517302748, + "grad_norm": 0.0013747498160228133, + "learning_rate": 7.821612349914237e-05, + "loss": 0.0132, + "step": 4620 + }, + { + "epoch": 1.2196295563904818, + "grad_norm": 0.0050489697605371475, + "learning_rate": 7.808417997097243e-05, + "loss": 0.0272, + "step": 4625 + }, + { + "epoch": 1.2209478610506888, + "grad_norm": 0.017974581569433212, + "learning_rate": 7.795223644280249e-05, + "loss": 0.0037, + "step": 4630 + }, + { + "epoch": 1.2222661657108957, + "grad_norm": 0.001916698063723743, + "learning_rate": 7.782029291463254e-05, + "loss": 0.002, + "step": 4635 + }, + { + "epoch": 1.2235844703711027, + "grad_norm": 0.05344574153423309, + "learning_rate": 7.76883493864626e-05, + "loss": 0.0114, + "step": 4640 + }, + { + "epoch": 1.2249027750313097, + "grad_norm": 0.22823786735534668, + "learning_rate": 7.755640585829265e-05, + "loss": 0.0296, + "step": 4645 + }, + { + "epoch": 1.2262210796915167, + "grad_norm": 0.02051074244081974, + "learning_rate": 7.742446233012272e-05, + "loss": 0.0037, + "step": 4650 + }, + { + "epoch": 1.2275393843517237, + "grad_norm": 0.9797061681747437, + "learning_rate": 7.729251880195276e-05, + "loss": 0.011, + "step": 4655 + }, + { + "epoch": 1.2288576890119307, + "grad_norm": 0.0017285927897319198, + "learning_rate": 7.716057527378282e-05, + "loss": 0.0224, + "step": 4660 + }, + { + "epoch": 1.2301759936721377, + "grad_norm": 0.021783018484711647, + "learning_rate": 7.702863174561288e-05, + "loss": 0.0174, + "step": 4665 + }, + { + "epoch": 1.2314942983323447, + "grad_norm": 0.00763307698071003, + "learning_rate": 7.689668821744293e-05, + "loss": 0.0516, + "step": 4670 + }, + { + "epoch": 1.2328126029925515, + "grad_norm": 0.32605209946632385, + "learning_rate": 7.676474468927299e-05, + "loss": 0.0301, + "step": 4675 + }, + { + "epoch": 1.2341309076527585, + "grad_norm": 1.2027722597122192, + "learning_rate": 7.663280116110306e-05, + "loss": 0.0474, + "step": 4680 + }, + { + "epoch": 1.2354492123129655, + "grad_norm": 0.10201717168092728, + "learning_rate": 7.650085763293311e-05, + "loss": 0.0144, + "step": 4685 + }, + { + "epoch": 1.2367675169731724, + "grad_norm": 0.013835664838552475, + "learning_rate": 7.636891410476317e-05, + "loss": 0.0024, + "step": 4690 + }, + { + "epoch": 1.2380858216333794, + "grad_norm": 0.005699916277080774, + "learning_rate": 7.623697057659322e-05, + "loss": 0.0089, + "step": 4695 + }, + { + "epoch": 1.2394041262935864, + "grad_norm": 0.16583332419395447, + "learning_rate": 7.610502704842328e-05, + "loss": 0.019, + "step": 4700 + }, + { + "epoch": 1.2407224309537934, + "grad_norm": 0.2734023332595825, + "learning_rate": 7.597308352025333e-05, + "loss": 0.0041, + "step": 4705 + }, + { + "epoch": 1.2420407356140004, + "grad_norm": 0.04209504276514053, + "learning_rate": 7.584113999208339e-05, + "loss": 0.0292, + "step": 4710 + }, + { + "epoch": 1.2433590402742074, + "grad_norm": 0.0303195733577013, + "learning_rate": 7.570919646391345e-05, + "loss": 0.0019, + "step": 4715 + }, + { + "epoch": 1.2446773449344144, + "grad_norm": 0.014011899940669537, + "learning_rate": 7.55772529357435e-05, + "loss": 0.0236, + "step": 4720 + }, + { + "epoch": 1.2459956495946214, + "grad_norm": 0.37838876247406006, + "learning_rate": 7.544530940757356e-05, + "loss": 0.0081, + "step": 4725 + }, + { + "epoch": 1.2473139542548284, + "grad_norm": 0.003717717481777072, + "learning_rate": 7.531336587940361e-05, + "loss": 0.0036, + "step": 4730 + }, + { + "epoch": 
1.2486322589150354, + "grad_norm": 1.2284752130508423, + "learning_rate": 7.518142235123368e-05, + "loss": 0.0089, + "step": 4735 + }, + { + "epoch": 1.2499505635752421, + "grad_norm": 0.015356095507740974, + "learning_rate": 7.504947882306374e-05, + "loss": 0.0074, + "step": 4740 + }, + { + "epoch": 1.2512688682354491, + "grad_norm": 0.0020383282098919153, + "learning_rate": 7.49175352948938e-05, + "loss": 0.0444, + "step": 4745 + }, + { + "epoch": 1.2525871728956561, + "grad_norm": 0.006680132355540991, + "learning_rate": 7.478559176672385e-05, + "loss": 0.009, + "step": 4750 + }, + { + "epoch": 1.2539054775558631, + "grad_norm": 0.01650019735097885, + "learning_rate": 7.465364823855389e-05, + "loss": 0.0022, + "step": 4755 + }, + { + "epoch": 1.2552237822160701, + "grad_norm": 0.009536102414131165, + "learning_rate": 7.452170471038396e-05, + "loss": 0.0026, + "step": 4760 + }, + { + "epoch": 1.256542086876277, + "grad_norm": 0.04677430912852287, + "learning_rate": 7.438976118221402e-05, + "loss": 0.004, + "step": 4765 + }, + { + "epoch": 1.257860391536484, + "grad_norm": 0.007777783088386059, + "learning_rate": 7.425781765404407e-05, + "loss": 0.0112, + "step": 4770 + }, + { + "epoch": 1.259178696196691, + "grad_norm": 0.03724197298288345, + "learning_rate": 7.412587412587413e-05, + "loss": 0.0065, + "step": 4775 + }, + { + "epoch": 1.260497000856898, + "grad_norm": 0.0023958412930369377, + "learning_rate": 7.399393059770418e-05, + "loss": 0.0238, + "step": 4780 + }, + { + "epoch": 1.261815305517105, + "grad_norm": 0.0036889975890517235, + "learning_rate": 7.386198706953424e-05, + "loss": 0.0012, + "step": 4785 + }, + { + "epoch": 1.263133610177312, + "grad_norm": 0.0009220903157256544, + "learning_rate": 7.373004354136431e-05, + "loss": 0.0017, + "step": 4790 + }, + { + "epoch": 1.2644519148375188, + "grad_norm": 0.0033395602367818356, + "learning_rate": 7.359810001319436e-05, + "loss": 0.0474, + "step": 4795 + }, + { + "epoch": 1.265770219497726, + "grad_norm": 0.004093261435627937, + "learning_rate": 7.346615648502442e-05, + "loss": 0.0025, + "step": 4800 + }, + { + "epoch": 1.2670885241579328, + "grad_norm": 0.004395488649606705, + "learning_rate": 7.333421295685446e-05, + "loss": 0.0011, + "step": 4805 + }, + { + "epoch": 1.2684068288181398, + "grad_norm": 0.024034051224589348, + "learning_rate": 7.320226942868452e-05, + "loss": 0.0027, + "step": 4810 + }, + { + "epoch": 1.2697251334783468, + "grad_norm": 0.9501499533653259, + "learning_rate": 7.307032590051459e-05, + "loss": 0.0279, + "step": 4815 + }, + { + "epoch": 1.2710434381385538, + "grad_norm": 0.008805549703538418, + "learning_rate": 7.293838237234464e-05, + "loss": 0.0403, + "step": 4820 + }, + { + "epoch": 1.2723617427987608, + "grad_norm": 0.01750873774290085, + "learning_rate": 7.28064388441747e-05, + "loss": 0.0571, + "step": 4825 + }, + { + "epoch": 1.2736800474589678, + "grad_norm": 0.004490260500460863, + "learning_rate": 7.267449531600475e-05, + "loss": 0.0269, + "step": 4830 + }, + { + "epoch": 1.2749983521191748, + "grad_norm": 0.07510064542293549, + "learning_rate": 7.254255178783481e-05, + "loss": 0.0123, + "step": 4835 + }, + { + "epoch": 1.2763166567793818, + "grad_norm": 0.039783038198947906, + "learning_rate": 7.241060825966486e-05, + "loss": 0.0137, + "step": 4840 + }, + { + "epoch": 1.2776349614395888, + "grad_norm": 0.019004900008440018, + "learning_rate": 7.227866473149493e-05, + "loss": 0.0047, + "step": 4845 + }, + { + "epoch": 1.2789532660997955, + "grad_norm": 0.04813052713871002, + 
"learning_rate": 7.214672120332499e-05, + "loss": 0.0021, + "step": 4850 + }, + { + "epoch": 1.2802715707600028, + "grad_norm": 0.00835048221051693, + "learning_rate": 7.201477767515503e-05, + "loss": 0.0014, + "step": 4855 + }, + { + "epoch": 1.2815898754202095, + "grad_norm": 0.008609198965132236, + "learning_rate": 7.188283414698509e-05, + "loss": 0.0219, + "step": 4860 + }, + { + "epoch": 1.2829081800804165, + "grad_norm": 0.007337458431720734, + "learning_rate": 7.175089061881514e-05, + "loss": 0.0014, + "step": 4865 + }, + { + "epoch": 1.2842264847406235, + "grad_norm": 0.0032645913306623697, + "learning_rate": 7.161894709064521e-05, + "loss": 0.0026, + "step": 4870 + }, + { + "epoch": 1.2855447894008305, + "grad_norm": 0.27384671568870544, + "learning_rate": 7.148700356247527e-05, + "loss": 0.0227, + "step": 4875 + }, + { + "epoch": 1.2868630940610375, + "grad_norm": 0.03584875538945198, + "learning_rate": 7.135506003430532e-05, + "loss": 0.0299, + "step": 4880 + }, + { + "epoch": 1.2881813987212445, + "grad_norm": 0.03482440486550331, + "learning_rate": 7.122311650613538e-05, + "loss": 0.0125, + "step": 4885 + }, + { + "epoch": 1.2894997033814515, + "grad_norm": 0.005974395200610161, + "learning_rate": 7.109117297796543e-05, + "loss": 0.0029, + "step": 4890 + }, + { + "epoch": 1.2908180080416585, + "grad_norm": 0.01820153370499611, + "learning_rate": 7.095922944979549e-05, + "loss": 0.0254, + "step": 4895 + }, + { + "epoch": 1.2921363127018655, + "grad_norm": 0.1733965277671814, + "learning_rate": 7.082728592162555e-05, + "loss": 0.028, + "step": 4900 + }, + { + "epoch": 1.2934546173620725, + "grad_norm": 1.3017303943634033, + "learning_rate": 7.06953423934556e-05, + "loss": 0.0213, + "step": 4905 + }, + { + "epoch": 1.2947729220222794, + "grad_norm": 0.01360877975821495, + "learning_rate": 7.056339886528566e-05, + "loss": 0.0039, + "step": 4910 + }, + { + "epoch": 1.2960912266824862, + "grad_norm": 0.01503999624401331, + "learning_rate": 7.043145533711571e-05, + "loss": 0.0102, + "step": 4915 + }, + { + "epoch": 1.2974095313426934, + "grad_norm": 0.2200804352760315, + "learning_rate": 7.029951180894577e-05, + "loss": 0.0461, + "step": 4920 + }, + { + "epoch": 1.2987278360029002, + "grad_norm": 0.08512946963310242, + "learning_rate": 7.016756828077582e-05, + "loss": 0.0066, + "step": 4925 + }, + { + "epoch": 1.3000461406631072, + "grad_norm": 0.08296570926904678, + "learning_rate": 7.00356247526059e-05, + "loss": 0.0223, + "step": 4930 + }, + { + "epoch": 1.3013644453233142, + "grad_norm": 0.008866079151630402, + "learning_rate": 6.990368122443595e-05, + "loss": 0.0032, + "step": 4935 + }, + { + "epoch": 1.3026827499835212, + "grad_norm": 0.024493014439940453, + "learning_rate": 6.9771737696266e-05, + "loss": 0.0128, + "step": 4940 + }, + { + "epoch": 1.3040010546437282, + "grad_norm": 0.08965341746807098, + "learning_rate": 6.963979416809606e-05, + "loss": 0.028, + "step": 4945 + }, + { + "epoch": 1.3053193593039352, + "grad_norm": 0.023156631737947464, + "learning_rate": 6.950785063992612e-05, + "loss": 0.0187, + "step": 4950 + }, + { + "epoch": 1.3066376639641422, + "grad_norm": 0.18552155792713165, + "learning_rate": 6.937590711175617e-05, + "loss": 0.0424, + "step": 4955 + }, + { + "epoch": 1.3079559686243492, + "grad_norm": 0.02200198918581009, + "learning_rate": 6.924396358358623e-05, + "loss": 0.0148, + "step": 4960 + }, + { + "epoch": 1.3092742732845561, + "grad_norm": 0.00568364467471838, + "learning_rate": 6.911202005541628e-05, + "loss": 0.0199, + "step": 4965 + }, + { 
+ "epoch": 1.310592577944763, + "grad_norm": 0.021591177210211754, + "learning_rate": 6.898007652724634e-05, + "loss": 0.0092, + "step": 4970 + }, + { + "epoch": 1.3119108826049701, + "grad_norm": 0.327177494764328, + "learning_rate": 6.88481329990764e-05, + "loss": 0.0047, + "step": 4975 + }, + { + "epoch": 1.313229187265177, + "grad_norm": 0.024512887001037598, + "learning_rate": 6.871618947090645e-05, + "loss": 0.0046, + "step": 4980 + }, + { + "epoch": 1.314547491925384, + "grad_norm": 0.05725006014108658, + "learning_rate": 6.858424594273652e-05, + "loss": 0.0227, + "step": 4985 + }, + { + "epoch": 1.3158657965855909, + "grad_norm": 0.011280277743935585, + "learning_rate": 6.845230241456658e-05, + "loss": 0.0056, + "step": 4990 + }, + { + "epoch": 1.3171841012457979, + "grad_norm": 0.022504402324557304, + "learning_rate": 6.832035888639663e-05, + "loss": 0.0029, + "step": 4995 + }, + { + "epoch": 1.3185024059060049, + "grad_norm": 0.02168826013803482, + "learning_rate": 6.818841535822669e-05, + "loss": 0.0198, + "step": 5000 + }, + { + "epoch": 1.3185024059060049, + "eval_loss": 0.025039294734597206, + "eval_runtime": 452.1097, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 5000 + }, + { + "epoch": 1.3198207105662119, + "grad_norm": 0.0064329709857702255, + "learning_rate": 6.805647183005673e-05, + "loss": 0.0299, + "step": 5005 + }, + { + "epoch": 1.3211390152264189, + "grad_norm": 0.00267885928042233, + "learning_rate": 6.79245283018868e-05, + "loss": 0.0065, + "step": 5010 + }, + { + "epoch": 1.3224573198866258, + "grad_norm": 0.6842889189720154, + "learning_rate": 6.779258477371685e-05, + "loss": 0.008, + "step": 5015 + }, + { + "epoch": 1.3237756245468328, + "grad_norm": 0.002985635306686163, + "learning_rate": 6.766064124554691e-05, + "loss": 0.0119, + "step": 5020 + }, + { + "epoch": 1.3250939292070396, + "grad_norm": 0.019304940477013588, + "learning_rate": 6.752869771737696e-05, + "loss": 0.0041, + "step": 5025 + }, + { + "epoch": 1.3264122338672468, + "grad_norm": 0.011305035091936588, + "learning_rate": 6.739675418920702e-05, + "loss": 0.0031, + "step": 5030 + }, + { + "epoch": 1.3277305385274536, + "grad_norm": 0.006184784695506096, + "learning_rate": 6.726481066103708e-05, + "loss": 0.0081, + "step": 5035 + }, + { + "epoch": 1.3290488431876606, + "grad_norm": 0.0073184361681342125, + "learning_rate": 6.713286713286715e-05, + "loss": 0.0202, + "step": 5040 + }, + { + "epoch": 1.3303671478478676, + "grad_norm": 0.006566181313246489, + "learning_rate": 6.70009236046972e-05, + "loss": 0.0052, + "step": 5045 + }, + { + "epoch": 1.3316854525080746, + "grad_norm": 0.31427526473999023, + "learning_rate": 6.686898007652726e-05, + "loss": 0.017, + "step": 5050 + }, + { + "epoch": 1.3330037571682816, + "grad_norm": 0.005085447803139687, + "learning_rate": 6.67370365483573e-05, + "loss": 0.009, + "step": 5055 + }, + { + "epoch": 1.3343220618284886, + "grad_norm": 0.2745366096496582, + "learning_rate": 6.660509302018735e-05, + "loss": 0.0119, + "step": 5060 + }, + { + "epoch": 1.3356403664886956, + "grad_norm": 0.2871796786785126, + "learning_rate": 6.647314949201742e-05, + "loss": 0.0158, + "step": 5065 + }, + { + "epoch": 1.3369586711489025, + "grad_norm": 0.2774186134338379, + "learning_rate": 6.634120596384748e-05, + "loss": 0.0084, + "step": 5070 + }, + { + "epoch": 1.3382769758091095, + "grad_norm": 0.013278775848448277, + "learning_rate": 6.620926243567753e-05, + "loss": 0.0111, + "step": 5075 + }, + { + "epoch": 1.3395952804693165, + 
"grad_norm": 0.01614517532289028, + "learning_rate": 6.607731890750759e-05, + "loss": 0.0066, + "step": 5080 + }, + { + "epoch": 1.3409135851295235, + "grad_norm": 0.0037789656780660152, + "learning_rate": 6.594537537933765e-05, + "loss": 0.0142, + "step": 5085 + }, + { + "epoch": 1.3422318897897303, + "grad_norm": 0.03221861273050308, + "learning_rate": 6.58134318511677e-05, + "loss": 0.0155, + "step": 5090 + }, + { + "epoch": 1.3435501944499375, + "grad_norm": 0.005637989845126867, + "learning_rate": 6.568148832299776e-05, + "loss": 0.0022, + "step": 5095 + }, + { + "epoch": 1.3448684991101443, + "grad_norm": 0.0017844432732090354, + "learning_rate": 6.554954479482783e-05, + "loss": 0.0217, + "step": 5100 + }, + { + "epoch": 1.3461868037703513, + "grad_norm": 0.08099021762609482, + "learning_rate": 6.541760126665787e-05, + "loss": 0.0222, + "step": 5105 + }, + { + "epoch": 1.3475051084305583, + "grad_norm": 0.011909045279026031, + "learning_rate": 6.528565773848792e-05, + "loss": 0.0058, + "step": 5110 + }, + { + "epoch": 1.3488234130907653, + "grad_norm": 0.7332578301429749, + "learning_rate": 6.515371421031798e-05, + "loss": 0.0286, + "step": 5115 + }, + { + "epoch": 1.3501417177509722, + "grad_norm": 0.3415885865688324, + "learning_rate": 6.502177068214804e-05, + "loss": 0.1191, + "step": 5120 + }, + { + "epoch": 1.3514600224111792, + "grad_norm": 0.00904211588203907, + "learning_rate": 6.48898271539781e-05, + "loss": 0.0043, + "step": 5125 + }, + { + "epoch": 1.3527783270713862, + "grad_norm": 0.1978830248117447, + "learning_rate": 6.475788362580816e-05, + "loss": 0.0316, + "step": 5130 + }, + { + "epoch": 1.3540966317315932, + "grad_norm": 0.10229042172431946, + "learning_rate": 6.462594009763822e-05, + "loss": 0.0194, + "step": 5135 + }, + { + "epoch": 1.3554149363918002, + "grad_norm": 0.4457210600376129, + "learning_rate": 6.449399656946827e-05, + "loss": 0.0276, + "step": 5140 + }, + { + "epoch": 1.356733241052007, + "grad_norm": 0.023706572130322456, + "learning_rate": 6.436205304129833e-05, + "loss": 0.0163, + "step": 5145 + }, + { + "epoch": 1.3580515457122142, + "grad_norm": 1.166896939277649, + "learning_rate": 6.423010951312838e-05, + "loss": 0.0189, + "step": 5150 + }, + { + "epoch": 1.359369850372421, + "grad_norm": 0.0016115796752274036, + "learning_rate": 6.409816598495844e-05, + "loss": 0.0191, + "step": 5155 + }, + { + "epoch": 1.360688155032628, + "grad_norm": 0.00786682777106762, + "learning_rate": 6.39662224567885e-05, + "loss": 0.0119, + "step": 5160 + }, + { + "epoch": 1.362006459692835, + "grad_norm": 1.042732834815979, + "learning_rate": 6.383427892861855e-05, + "loss": 0.0497, + "step": 5165 + }, + { + "epoch": 1.363324764353042, + "grad_norm": 0.007983304560184479, + "learning_rate": 6.37023354004486e-05, + "loss": 0.044, + "step": 5170 + }, + { + "epoch": 1.364643069013249, + "grad_norm": 0.009767642244696617, + "learning_rate": 6.357039187227866e-05, + "loss": 0.0405, + "step": 5175 + }, + { + "epoch": 1.365961373673456, + "grad_norm": 0.03164628520607948, + "learning_rate": 6.343844834410873e-05, + "loss": 0.0138, + "step": 5180 + }, + { + "epoch": 1.367279678333663, + "grad_norm": 0.004159921780228615, + "learning_rate": 6.330650481593879e-05, + "loss": 0.0045, + "step": 5185 + }, + { + "epoch": 1.36859798299387, + "grad_norm": 0.004395391326397657, + "learning_rate": 6.317456128776884e-05, + "loss": 0.0046, + "step": 5190 + }, + { + "epoch": 1.369916287654077, + "grad_norm": 0.011886746622622013, + "learning_rate": 6.30426177595989e-05, + "loss": 
0.0064, + "step": 5195 + }, + { + "epoch": 1.371234592314284, + "grad_norm": 0.2259266972541809, + "learning_rate": 6.291067423142895e-05, + "loss": 0.0076, + "step": 5200 + }, + { + "epoch": 1.372552896974491, + "grad_norm": 0.01407301053404808, + "learning_rate": 6.277873070325901e-05, + "loss": 0.0201, + "step": 5205 + }, + { + "epoch": 1.3738712016346977, + "grad_norm": 0.00911578256636858, + "learning_rate": 6.264678717508906e-05, + "loss": 0.0164, + "step": 5210 + }, + { + "epoch": 1.3751895062949049, + "grad_norm": 0.20968014001846313, + "learning_rate": 6.251484364691912e-05, + "loss": 0.0075, + "step": 5215 + }, + { + "epoch": 1.3765078109551117, + "grad_norm": 0.008801166899502277, + "learning_rate": 6.238290011874918e-05, + "loss": 0.0068, + "step": 5220 + }, + { + "epoch": 1.3778261156153186, + "grad_norm": 0.007181806955486536, + "learning_rate": 6.225095659057923e-05, + "loss": 0.0136, + "step": 5225 + }, + { + "epoch": 1.3791444202755256, + "grad_norm": 0.7527109980583191, + "learning_rate": 6.211901306240929e-05, + "loss": 0.0287, + "step": 5230 + }, + { + "epoch": 1.3804627249357326, + "grad_norm": 0.039015207439661026, + "learning_rate": 6.198706953423936e-05, + "loss": 0.0326, + "step": 5235 + }, + { + "epoch": 1.3817810295959396, + "grad_norm": 0.021076606586575508, + "learning_rate": 6.185512600606941e-05, + "loss": 0.0191, + "step": 5240 + }, + { + "epoch": 1.3830993342561466, + "grad_norm": 0.016630731523036957, + "learning_rate": 6.172318247789947e-05, + "loss": 0.0131, + "step": 5245 + }, + { + "epoch": 1.3844176389163536, + "grad_norm": 0.011133644729852676, + "learning_rate": 6.159123894972952e-05, + "loss": 0.0029, + "step": 5250 + }, + { + "epoch": 1.3857359435765606, + "grad_norm": 0.6434677243232727, + "learning_rate": 6.145929542155957e-05, + "loss": 0.0091, + "step": 5255 + }, + { + "epoch": 1.3870542482367676, + "grad_norm": 0.051020298153162, + "learning_rate": 6.132735189338964e-05, + "loss": 0.0086, + "step": 5260 + }, + { + "epoch": 1.3883725528969744, + "grad_norm": 0.016413932666182518, + "learning_rate": 6.119540836521969e-05, + "loss": 0.0061, + "step": 5265 + }, + { + "epoch": 1.3896908575571816, + "grad_norm": 0.005769540090113878, + "learning_rate": 6.106346483704975e-05, + "loss": 0.0027, + "step": 5270 + }, + { + "epoch": 1.3910091622173884, + "grad_norm": 0.06687796860933304, + "learning_rate": 6.09315213088798e-05, + "loss": 0.0423, + "step": 5275 + }, + { + "epoch": 1.3923274668775953, + "grad_norm": 0.005641553085297346, + "learning_rate": 6.079957778070986e-05, + "loss": 0.0353, + "step": 5280 + }, + { + "epoch": 1.3936457715378023, + "grad_norm": 0.04460568353533745, + "learning_rate": 6.066763425253992e-05, + "loss": 0.0041, + "step": 5285 + }, + { + "epoch": 1.3949640761980093, + "grad_norm": 0.0387534461915493, + "learning_rate": 6.0535690724369976e-05, + "loss": 0.006, + "step": 5290 + }, + { + "epoch": 1.3962823808582163, + "grad_norm": 0.010292598977684975, + "learning_rate": 6.040374719620003e-05, + "loss": 0.0038, + "step": 5295 + }, + { + "epoch": 1.3976006855184233, + "grad_norm": 0.3646155297756195, + "learning_rate": 6.0271803668030094e-05, + "loss": 0.0111, + "step": 5300 + }, + { + "epoch": 1.3989189901786303, + "grad_norm": 0.022035539150238037, + "learning_rate": 6.0139860139860136e-05, + "loss": 0.0507, + "step": 5305 + }, + { + "epoch": 1.4002372948388373, + "grad_norm": 0.003314939560368657, + "learning_rate": 6.00079166116902e-05, + "loss": 0.0132, + "step": 5310 + }, + { + "epoch": 1.4015555994990443, + "grad_norm": 
0.0838267058134079, + "learning_rate": 5.9875973083520254e-05, + "loss": 0.0105, + "step": 5315 + }, + { + "epoch": 1.4028739041592513, + "grad_norm": 0.009368584491312504, + "learning_rate": 5.974402955535031e-05, + "loss": 0.0026, + "step": 5320 + }, + { + "epoch": 1.4041922088194583, + "grad_norm": 0.031248098239302635, + "learning_rate": 5.961208602718037e-05, + "loss": 0.0151, + "step": 5325 + }, + { + "epoch": 1.405510513479665, + "grad_norm": 0.06447605788707733, + "learning_rate": 5.948014249901043e-05, + "loss": 0.0219, + "step": 5330 + }, + { + "epoch": 1.4068288181398723, + "grad_norm": 0.010814374312758446, + "learning_rate": 5.9348198970840484e-05, + "loss": 0.0038, + "step": 5335 + }, + { + "epoch": 1.408147122800079, + "grad_norm": 0.6235967874526978, + "learning_rate": 5.9216255442670546e-05, + "loss": 0.0354, + "step": 5340 + }, + { + "epoch": 1.409465427460286, + "grad_norm": 0.026741521432995796, + "learning_rate": 5.90843119145006e-05, + "loss": 0.0032, + "step": 5345 + }, + { + "epoch": 1.410783732120493, + "grad_norm": 0.019413433969020844, + "learning_rate": 5.895236838633066e-05, + "loss": 0.0216, + "step": 5350 + }, + { + "epoch": 1.4121020367807, + "grad_norm": 0.0735543966293335, + "learning_rate": 5.8820424858160706e-05, + "loss": 0.0033, + "step": 5355 + }, + { + "epoch": 1.413420341440907, + "grad_norm": 0.005189546383917332, + "learning_rate": 5.868848132999076e-05, + "loss": 0.021, + "step": 5360 + }, + { + "epoch": 1.414738646101114, + "grad_norm": 0.21240335702896118, + "learning_rate": 5.8556537801820824e-05, + "loss": 0.0294, + "step": 5365 + }, + { + "epoch": 1.416056950761321, + "grad_norm": 0.010165920481085777, + "learning_rate": 5.842459427365088e-05, + "loss": 0.0021, + "step": 5370 + }, + { + "epoch": 1.417375255421528, + "grad_norm": 0.026774069294333458, + "learning_rate": 5.8292650745480936e-05, + "loss": 0.0299, + "step": 5375 + }, + { + "epoch": 1.418693560081735, + "grad_norm": 0.0019810455851256847, + "learning_rate": 5.816070721731099e-05, + "loss": 0.0029, + "step": 5380 + }, + { + "epoch": 1.4200118647419417, + "grad_norm": 0.038888879120349884, + "learning_rate": 5.8028763689141054e-05, + "loss": 0.0069, + "step": 5385 + }, + { + "epoch": 1.421330169402149, + "grad_norm": 0.016180936247110367, + "learning_rate": 5.789682016097111e-05, + "loss": 0.0032, + "step": 5390 + }, + { + "epoch": 1.4226484740623557, + "grad_norm": 0.01119404286146164, + "learning_rate": 5.7764876632801165e-05, + "loss": 0.0024, + "step": 5395 + }, + { + "epoch": 1.4239667787225627, + "grad_norm": 0.010486694052815437, + "learning_rate": 5.763293310463123e-05, + "loss": 0.0324, + "step": 5400 + }, + { + "epoch": 1.4252850833827697, + "grad_norm": 0.005453066434711218, + "learning_rate": 5.750098957646127e-05, + "loss": 0.0038, + "step": 5405 + }, + { + "epoch": 1.4266033880429767, + "grad_norm": 0.17556461691856384, + "learning_rate": 5.736904604829133e-05, + "loss": 0.0305, + "step": 5410 + }, + { + "epoch": 1.4279216927031837, + "grad_norm": 0.03074715845286846, + "learning_rate": 5.723710252012139e-05, + "loss": 0.003, + "step": 5415 + }, + { + "epoch": 1.4292399973633907, + "grad_norm": 1.7238941192626953, + "learning_rate": 5.710515899195144e-05, + "loss": 0.0254, + "step": 5420 + }, + { + "epoch": 1.4305583020235977, + "grad_norm": 0.012462320737540722, + "learning_rate": 5.6973215463781506e-05, + "loss": 0.0018, + "step": 5425 + }, + { + "epoch": 1.4318766066838047, + "grad_norm": 0.021576853469014168, + "learning_rate": 5.684127193561156e-05, + "loss": 
0.0472, + "step": 5430 + }, + { + "epoch": 1.4331949113440117, + "grad_norm": 0.2862134575843811, + "learning_rate": 5.670932840744162e-05, + "loss": 0.0258, + "step": 5435 + }, + { + "epoch": 1.4345132160042184, + "grad_norm": 0.28419312834739685, + "learning_rate": 5.657738487927168e-05, + "loss": 0.0053, + "step": 5440 + }, + { + "epoch": 1.4358315206644257, + "grad_norm": 0.013650139793753624, + "learning_rate": 5.6445441351101735e-05, + "loss": 0.0126, + "step": 5445 + }, + { + "epoch": 1.4371498253246324, + "grad_norm": 0.01203097216784954, + "learning_rate": 5.631349782293179e-05, + "loss": 0.0076, + "step": 5450 + }, + { + "epoch": 1.4384681299848394, + "grad_norm": 0.0881054624915123, + "learning_rate": 5.618155429476184e-05, + "loss": 0.0178, + "step": 5455 + }, + { + "epoch": 1.4397864346450464, + "grad_norm": 0.5258516669273376, + "learning_rate": 5.6049610766591895e-05, + "loss": 0.0112, + "step": 5460 + }, + { + "epoch": 1.4411047393052534, + "grad_norm": 0.001202153041958809, + "learning_rate": 5.591766723842196e-05, + "loss": 0.0089, + "step": 5465 + }, + { + "epoch": 1.4424230439654604, + "grad_norm": 0.4498993456363678, + "learning_rate": 5.5785723710252014e-05, + "loss": 0.0252, + "step": 5470 + }, + { + "epoch": 1.4437413486256674, + "grad_norm": 0.17477644979953766, + "learning_rate": 5.565378018208207e-05, + "loss": 0.0169, + "step": 5475 + }, + { + "epoch": 1.4450596532858744, + "grad_norm": 0.019443338736891747, + "learning_rate": 5.552183665391213e-05, + "loss": 0.0019, + "step": 5480 + }, + { + "epoch": 1.4463779579460814, + "grad_norm": 0.005653039086610079, + "learning_rate": 5.538989312574219e-05, + "loss": 0.0231, + "step": 5485 + }, + { + "epoch": 1.4476962626062884, + "grad_norm": 0.01554112322628498, + "learning_rate": 5.525794959757224e-05, + "loss": 0.0167, + "step": 5490 + }, + { + "epoch": 1.4490145672664954, + "grad_norm": 0.044272180646657944, + "learning_rate": 5.5126006069402305e-05, + "loss": 0.007, + "step": 5495 + }, + { + "epoch": 1.4503328719267023, + "grad_norm": 0.014857172966003418, + "learning_rate": 5.499406254123236e-05, + "loss": 0.0045, + "step": 5500 + }, + { + "epoch": 1.4503328719267023, + "eval_loss": 0.02392147295176983, + "eval_runtime": 452.468, + "eval_samples_per_second": 7.452, + "eval_steps_per_second": 3.726, + "step": 5500 + }, + { + "epoch": 1.4516511765869091, + "grad_norm": 0.007390835788100958, + "learning_rate": 5.486211901306241e-05, + "loss": 0.0171, + "step": 5505 + }, + { + "epoch": 1.4529694812471163, + "grad_norm": 0.0050474610179662704, + "learning_rate": 5.4730175484892466e-05, + "loss": 0.004, + "step": 5510 + }, + { + "epoch": 1.454287785907323, + "grad_norm": 0.08066163957118988, + "learning_rate": 5.459823195672252e-05, + "loss": 0.0103, + "step": 5515 + }, + { + "epoch": 1.45560609056753, + "grad_norm": 0.0062376330606639385, + "learning_rate": 5.4466288428552584e-05, + "loss": 0.0066, + "step": 5520 + }, + { + "epoch": 1.456924395227737, + "grad_norm": 0.00711809890344739, + "learning_rate": 5.433434490038264e-05, + "loss": 0.003, + "step": 5525 + }, + { + "epoch": 1.458242699887944, + "grad_norm": 0.004010149277746677, + "learning_rate": 5.4202401372212695e-05, + "loss": 0.0231, + "step": 5530 + }, + { + "epoch": 1.459561004548151, + "grad_norm": 0.4791967272758484, + "learning_rate": 5.407045784404276e-05, + "loss": 0.0277, + "step": 5535 + }, + { + "epoch": 1.460879309208358, + "grad_norm": 0.03979189693927765, + "learning_rate": 5.393851431587281e-05, + "loss": 0.0033, + "step": 5540 + }, + { + 
"epoch": 1.462197613868565, + "grad_norm": 0.03331119939684868, + "learning_rate": 5.380657078770287e-05, + "loss": 0.0187, + "step": 5545 + }, + { + "epoch": 1.463515918528772, + "grad_norm": 0.0042802803218364716, + "learning_rate": 5.367462725953293e-05, + "loss": 0.0032, + "step": 5550 + }, + { + "epoch": 1.464834223188979, + "grad_norm": 0.05439918115735054, + "learning_rate": 5.354268373136297e-05, + "loss": 0.0043, + "step": 5555 + }, + { + "epoch": 1.4661525278491858, + "grad_norm": 0.042643506079912186, + "learning_rate": 5.3410740203193036e-05, + "loss": 0.0059, + "step": 5560 + }, + { + "epoch": 1.467470832509393, + "grad_norm": 0.023453116416931152, + "learning_rate": 5.327879667502309e-05, + "loss": 0.0043, + "step": 5565 + }, + { + "epoch": 1.4687891371695998, + "grad_norm": 0.037712760269641876, + "learning_rate": 5.314685314685315e-05, + "loss": 0.0033, + "step": 5570 + }, + { + "epoch": 1.4701074418298068, + "grad_norm": 1.0485608577728271, + "learning_rate": 5.301490961868321e-05, + "loss": 0.0489, + "step": 5575 + }, + { + "epoch": 1.4714257464900138, + "grad_norm": 0.004728829488158226, + "learning_rate": 5.2882966090513265e-05, + "loss": 0.0067, + "step": 5580 + }, + { + "epoch": 1.4727440511502208, + "grad_norm": 0.027893677353858948, + "learning_rate": 5.275102256234332e-05, + "loss": 0.0208, + "step": 5585 + }, + { + "epoch": 1.4740623558104278, + "grad_norm": 0.02256879396736622, + "learning_rate": 5.2619079034173377e-05, + "loss": 0.0036, + "step": 5590 + }, + { + "epoch": 1.4753806604706348, + "grad_norm": 0.12636558711528778, + "learning_rate": 5.248713550600344e-05, + "loss": 0.0046, + "step": 5595 + }, + { + "epoch": 1.4766989651308418, + "grad_norm": 0.000997041119262576, + "learning_rate": 5.235519197783348e-05, + "loss": 0.0101, + "step": 5600 + }, + { + "epoch": 1.4780172697910487, + "grad_norm": 0.023494020104408264, + "learning_rate": 5.2223248449663543e-05, + "loss": 0.0039, + "step": 5605 + }, + { + "epoch": 1.4793355744512557, + "grad_norm": 0.01525307446718216, + "learning_rate": 5.20913049214936e-05, + "loss": 0.021, + "step": 5610 + }, + { + "epoch": 1.4806538791114627, + "grad_norm": 0.0024215306621044874, + "learning_rate": 5.1959361393323655e-05, + "loss": 0.0017, + "step": 5615 + }, + { + "epoch": 1.4819721837716697, + "grad_norm": 1.4708061218261719, + "learning_rate": 5.182741786515372e-05, + "loss": 0.04, + "step": 5620 + }, + { + "epoch": 1.4832904884318765, + "grad_norm": 0.015033531002700329, + "learning_rate": 5.169547433698377e-05, + "loss": 0.0042, + "step": 5625 + }, + { + "epoch": 1.4846087930920837, + "grad_norm": 0.0035444959066808224, + "learning_rate": 5.156353080881383e-05, + "loss": 0.0087, + "step": 5630 + }, + { + "epoch": 1.4859270977522905, + "grad_norm": 0.010087919421494007, + "learning_rate": 5.143158728064389e-05, + "loss": 0.0158, + "step": 5635 + }, + { + "epoch": 1.4872454024124975, + "grad_norm": 0.05779251083731651, + "learning_rate": 5.129964375247395e-05, + "loss": 0.0157, + "step": 5640 + }, + { + "epoch": 1.4885637070727045, + "grad_norm": 0.14927980303764343, + "learning_rate": 5.1167700224304e-05, + "loss": 0.0257, + "step": 5645 + }, + { + "epoch": 1.4898820117329115, + "grad_norm": 0.004252352751791477, + "learning_rate": 5.103575669613405e-05, + "loss": 0.0198, + "step": 5650 + }, + { + "epoch": 1.4912003163931185, + "grad_norm": 0.0029206848703324795, + "learning_rate": 5.090381316796411e-05, + "loss": 0.0016, + "step": 5655 + }, + { + "epoch": 1.4925186210533254, + "grad_norm": 0.005047530401498079, + 
"learning_rate": 5.077186963979417e-05, + "loss": 0.0023, + "step": 5660 + }, + { + "epoch": 1.4938369257135324, + "grad_norm": 0.003732564626261592, + "learning_rate": 5.0639926111624225e-05, + "loss": 0.0336, + "step": 5665 + }, + { + "epoch": 1.4951552303737394, + "grad_norm": 0.3832889497280121, + "learning_rate": 5.050798258345428e-05, + "loss": 0.0476, + "step": 5670 + }, + { + "epoch": 1.4964735350339464, + "grad_norm": 0.06733009219169617, + "learning_rate": 5.037603905528434e-05, + "loss": 0.0044, + "step": 5675 + }, + { + "epoch": 1.4977918396941532, + "grad_norm": 0.008067069575190544, + "learning_rate": 5.02440955271144e-05, + "loss": 0.0035, + "step": 5680 + }, + { + "epoch": 1.4991101443543604, + "grad_norm": 0.01706300489604473, + "learning_rate": 5.0112151998944454e-05, + "loss": 0.0031, + "step": 5685 + }, + { + "epoch": 1.5004284490145672, + "grad_norm": 0.009932024404406548, + "learning_rate": 4.998020847077451e-05, + "loss": 0.0587, + "step": 5690 + }, + { + "epoch": 1.5017467536747744, + "grad_norm": 0.006488936021924019, + "learning_rate": 4.9848264942604566e-05, + "loss": 0.002, + "step": 5695 + }, + { + "epoch": 1.5030650583349812, + "grad_norm": 0.17488756775856018, + "learning_rate": 4.971632141443462e-05, + "loss": 0.0245, + "step": 5700 + }, + { + "epoch": 1.5043833629951882, + "grad_norm": 0.3327178359031677, + "learning_rate": 4.9584377886264684e-05, + "loss": 0.0404, + "step": 5705 + }, + { + "epoch": 1.5057016676553951, + "grad_norm": 0.18467263877391815, + "learning_rate": 4.945243435809474e-05, + "loss": 0.0248, + "step": 5710 + }, + { + "epoch": 1.5070199723156021, + "grad_norm": 0.020061776041984558, + "learning_rate": 4.9320490829924795e-05, + "loss": 0.0034, + "step": 5715 + }, + { + "epoch": 1.5083382769758091, + "grad_norm": 0.0005288647953420877, + "learning_rate": 4.918854730175485e-05, + "loss": 0.0076, + "step": 5720 + }, + { + "epoch": 1.5096565816360161, + "grad_norm": 0.007515576668083668, + "learning_rate": 4.9056603773584906e-05, + "loss": 0.004, + "step": 5725 + }, + { + "epoch": 1.5109748862962231, + "grad_norm": 0.05365758761763573, + "learning_rate": 4.892466024541497e-05, + "loss": 0.0222, + "step": 5730 + }, + { + "epoch": 1.51229319095643, + "grad_norm": 0.00572391040623188, + "learning_rate": 4.8792716717245025e-05, + "loss": 0.0132, + "step": 5735 + }, + { + "epoch": 1.513611495616637, + "grad_norm": 0.21178627014160156, + "learning_rate": 4.8660773189075073e-05, + "loss": 0.0417, + "step": 5740 + }, + { + "epoch": 1.5149298002768439, + "grad_norm": 0.0641486868262291, + "learning_rate": 4.8528829660905136e-05, + "loss": 0.011, + "step": 5745 + }, + { + "epoch": 1.516248104937051, + "grad_norm": 0.04451924189925194, + "learning_rate": 4.839688613273519e-05, + "loss": 0.012, + "step": 5750 + }, + { + "epoch": 1.5175664095972579, + "grad_norm": 0.019951259717345238, + "learning_rate": 4.826494260456525e-05, + "loss": 0.009, + "step": 5755 + }, + { + "epoch": 1.5188847142574649, + "grad_norm": 0.021919893100857735, + "learning_rate": 4.813299907639531e-05, + "loss": 0.0081, + "step": 5760 + }, + { + "epoch": 1.5202030189176718, + "grad_norm": 0.5730367302894592, + "learning_rate": 4.800105554822536e-05, + "loss": 0.0254, + "step": 5765 + }, + { + "epoch": 1.5215213235778788, + "grad_norm": 0.02501523122191429, + "learning_rate": 4.786911202005542e-05, + "loss": 0.0045, + "step": 5770 + }, + { + "epoch": 1.5228396282380858, + "grad_norm": 0.01574208028614521, + "learning_rate": 4.773716849188548e-05, + "loss": 0.0081, + "step": 5775 + 
}, + { + "epoch": 1.5241579328982928, + "grad_norm": 0.009626791812479496, + "learning_rate": 4.760522496371553e-05, + "loss": 0.0037, + "step": 5780 + }, + { + "epoch": 1.5254762375584998, + "grad_norm": 0.535539448261261, + "learning_rate": 4.747328143554559e-05, + "loss": 0.0149, + "step": 5785 + }, + { + "epoch": 1.5267945422187066, + "grad_norm": 0.004934845492243767, + "learning_rate": 4.7341337907375644e-05, + "loss": 0.0048, + "step": 5790 + }, + { + "epoch": 1.5281128468789138, + "grad_norm": 0.009070080704987049, + "learning_rate": 4.72093943792057e-05, + "loss": 0.0028, + "step": 5795 + }, + { + "epoch": 1.5294311515391206, + "grad_norm": 0.0040720063261687756, + "learning_rate": 4.707745085103576e-05, + "loss": 0.0016, + "step": 5800 + }, + { + "epoch": 1.5307494561993278, + "grad_norm": 0.45212000608444214, + "learning_rate": 4.694550732286582e-05, + "loss": 0.0111, + "step": 5805 + }, + { + "epoch": 1.5320677608595346, + "grad_norm": 0.024048497900366783, + "learning_rate": 4.681356379469587e-05, + "loss": 0.0149, + "step": 5810 + }, + { + "epoch": 1.5333860655197418, + "grad_norm": 0.11899136006832123, + "learning_rate": 4.668162026652593e-05, + "loss": 0.0034, + "step": 5815 + }, + { + "epoch": 1.5347043701799485, + "grad_norm": 0.011249657720327377, + "learning_rate": 4.6549676738355984e-05, + "loss": 0.0052, + "step": 5820 + }, + { + "epoch": 1.5360226748401555, + "grad_norm": 0.051634710282087326, + "learning_rate": 4.641773321018604e-05, + "loss": 0.0031, + "step": 5825 + }, + { + "epoch": 1.5373409795003625, + "grad_norm": 0.3726826012134552, + "learning_rate": 4.62857896820161e-05, + "loss": 0.0582, + "step": 5830 + }, + { + "epoch": 1.5386592841605695, + "grad_norm": 0.5827310681343079, + "learning_rate": 4.615384615384616e-05, + "loss": 0.0652, + "step": 5835 + }, + { + "epoch": 1.5399775888207765, + "grad_norm": 0.006390869617462158, + "learning_rate": 4.6021902625676214e-05, + "loss": 0.0022, + "step": 5840 + }, + { + "epoch": 1.5412958934809835, + "grad_norm": 0.022760871797800064, + "learning_rate": 4.588995909750627e-05, + "loss": 0.0311, + "step": 5845 + }, + { + "epoch": 1.5426141981411905, + "grad_norm": 0.22773241996765137, + "learning_rate": 4.5758015569336325e-05, + "loss": 0.0051, + "step": 5850 + }, + { + "epoch": 1.5439325028013973, + "grad_norm": 0.015375247225165367, + "learning_rate": 4.562607204116639e-05, + "loss": 0.0023, + "step": 5855 + }, + { + "epoch": 1.5452508074616045, + "grad_norm": 0.007347101345658302, + "learning_rate": 4.549412851299644e-05, + "loss": 0.0437, + "step": 5860 + }, + { + "epoch": 1.5465691121218113, + "grad_norm": 0.012344900518655777, + "learning_rate": 4.536218498482649e-05, + "loss": 0.004, + "step": 5865 + }, + { + "epoch": 1.5478874167820185, + "grad_norm": 0.27038896083831787, + "learning_rate": 4.5230241456656555e-05, + "loss": 0.0047, + "step": 5870 + }, + { + "epoch": 1.5492057214422252, + "grad_norm": 0.016395213082432747, + "learning_rate": 4.509829792848661e-05, + "loss": 0.0026, + "step": 5875 + }, + { + "epoch": 1.5505240261024322, + "grad_norm": 0.4217267632484436, + "learning_rate": 4.4966354400316666e-05, + "loss": 0.0364, + "step": 5880 + }, + { + "epoch": 1.5518423307626392, + "grad_norm": 0.20046105980873108, + "learning_rate": 4.483441087214673e-05, + "loss": 0.0243, + "step": 5885 + }, + { + "epoch": 1.5531606354228462, + "grad_norm": 0.004307698458433151, + "learning_rate": 4.470246734397678e-05, + "loss": 0.0064, + "step": 5890 + }, + { + "epoch": 1.5544789400830532, + "grad_norm": 
0.46102187037467957, + "learning_rate": 4.457052381580683e-05, + "loss": 0.0115, + "step": 5895 + }, + { + "epoch": 1.5557972447432602, + "grad_norm": 0.0689118504524231, + "learning_rate": 4.4438580287636895e-05, + "loss": 0.0334, + "step": 5900 + }, + { + "epoch": 1.5571155494034672, + "grad_norm": 0.003091114340350032, + "learning_rate": 4.430663675946695e-05, + "loss": 0.0246, + "step": 5905 + }, + { + "epoch": 1.558433854063674, + "grad_norm": 0.003877349430695176, + "learning_rate": 4.417469323129701e-05, + "loss": 0.0032, + "step": 5910 + }, + { + "epoch": 1.5597521587238812, + "grad_norm": 0.30713143944740295, + "learning_rate": 4.404274970312706e-05, + "loss": 0.0229, + "step": 5915 + }, + { + "epoch": 1.561070463384088, + "grad_norm": 0.07344445586204529, + "learning_rate": 4.391080617495712e-05, + "loss": 0.0078, + "step": 5920 + }, + { + "epoch": 1.5623887680442952, + "grad_norm": 0.01774723082780838, + "learning_rate": 4.377886264678718e-05, + "loss": 0.0034, + "step": 5925 + }, + { + "epoch": 1.563707072704502, + "grad_norm": 0.476324200630188, + "learning_rate": 4.3646919118617236e-05, + "loss": 0.0071, + "step": 5930 + }, + { + "epoch": 1.5650253773647091, + "grad_norm": 0.11624465882778168, + "learning_rate": 4.351497559044729e-05, + "loss": 0.0236, + "step": 5935 + }, + { + "epoch": 1.566343682024916, + "grad_norm": 0.190691277384758, + "learning_rate": 4.338303206227735e-05, + "loss": 0.006, + "step": 5940 + }, + { + "epoch": 1.567661986685123, + "grad_norm": 0.20517045259475708, + "learning_rate": 4.32510885341074e-05, + "loss": 0.009, + "step": 5945 + }, + { + "epoch": 1.56898029134533, + "grad_norm": 0.008122317492961884, + "learning_rate": 4.311914500593746e-05, + "loss": 0.0041, + "step": 5950 + }, + { + "epoch": 1.570298596005537, + "grad_norm": 0.01982291042804718, + "learning_rate": 4.298720147776752e-05, + "loss": 0.0258, + "step": 5955 + }, + { + "epoch": 1.5716169006657439, + "grad_norm": 0.000996922142803669, + "learning_rate": 4.285525794959758e-05, + "loss": 0.0233, + "step": 5960 + }, + { + "epoch": 1.5729352053259509, + "grad_norm": 0.09725592285394669, + "learning_rate": 4.272331442142763e-05, + "loss": 0.0218, + "step": 5965 + }, + { + "epoch": 1.5742535099861579, + "grad_norm": 0.0672350749373436, + "learning_rate": 4.259137089325769e-05, + "loss": 0.0194, + "step": 5970 + }, + { + "epoch": 1.5755718146463646, + "grad_norm": 0.014844833873212337, + "learning_rate": 4.2459427365087744e-05, + "loss": 0.0298, + "step": 5975 + }, + { + "epoch": 1.5768901193065719, + "grad_norm": 0.030519040301442146, + "learning_rate": 4.2327483836917806e-05, + "loss": 0.0178, + "step": 5980 + }, + { + "epoch": 1.5782084239667786, + "grad_norm": 0.018561460077762604, + "learning_rate": 4.219554030874786e-05, + "loss": 0.0154, + "step": 5985 + }, + { + "epoch": 1.5795267286269858, + "grad_norm": 0.02470085583627224, + "learning_rate": 4.206359678057791e-05, + "loss": 0.0361, + "step": 5990 + }, + { + "epoch": 1.5808450332871926, + "grad_norm": 0.055412422865629196, + "learning_rate": 4.193165325240797e-05, + "loss": 0.0162, + "step": 5995 + }, + { + "epoch": 1.5821633379473996, + "grad_norm": 0.0034158769994974136, + "learning_rate": 4.179970972423803e-05, + "loss": 0.0068, + "step": 6000 + }, + { + "epoch": 1.5821633379473996, + "eval_loss": 0.024797894060611725, + "eval_runtime": 452.1611, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 6000 + }, + { + "epoch": 1.5834816426076066, + "grad_norm": 0.01284120511263609, + "learning_rate": 
4.1667766196068085e-05, + "loss": 0.0036, + "step": 6005 + }, + { + "epoch": 1.5847999472678136, + "grad_norm": 0.01274865586310625, + "learning_rate": 4.153582266789815e-05, + "loss": 0.0447, + "step": 6010 + }, + { + "epoch": 1.5861182519280206, + "grad_norm": 0.03555435314774513, + "learning_rate": 4.1403879139728196e-05, + "loss": 0.0078, + "step": 6015 + }, + { + "epoch": 1.5874365565882276, + "grad_norm": 0.0011938117677345872, + "learning_rate": 4.127193561155825e-05, + "loss": 0.0136, + "step": 6020 + }, + { + "epoch": 1.5887548612484346, + "grad_norm": 0.9741255640983582, + "learning_rate": 4.1139992083388314e-05, + "loss": 0.0153, + "step": 6025 + }, + { + "epoch": 1.5900731659086413, + "grad_norm": 0.011220674030482769, + "learning_rate": 4.100804855521837e-05, + "loss": 0.0262, + "step": 6030 + }, + { + "epoch": 1.5913914705688486, + "grad_norm": 0.021556466817855835, + "learning_rate": 4.0876105027048425e-05, + "loss": 0.0044, + "step": 6035 + }, + { + "epoch": 1.5927097752290553, + "grad_norm": 0.2725502848625183, + "learning_rate": 4.074416149887848e-05, + "loss": 0.0558, + "step": 6040 + }, + { + "epoch": 1.5940280798892625, + "grad_norm": 0.6407182216644287, + "learning_rate": 4.0612217970708537e-05, + "loss": 0.0261, + "step": 6045 + }, + { + "epoch": 1.5953463845494693, + "grad_norm": 0.0024960115551948547, + "learning_rate": 4.04802744425386e-05, + "loss": 0.0128, + "step": 6050 + }, + { + "epoch": 1.5966646892096763, + "grad_norm": 0.11380109190940857, + "learning_rate": 4.0348330914368655e-05, + "loss": 0.0199, + "step": 6055 + }, + { + "epoch": 1.5979829938698833, + "grad_norm": 0.18358005583286285, + "learning_rate": 4.0216387386198704e-05, + "loss": 0.0083, + "step": 6060 + }, + { + "epoch": 1.5993012985300903, + "grad_norm": 0.06412303447723389, + "learning_rate": 4.0084443858028766e-05, + "loss": 0.0548, + "step": 6065 + }, + { + "epoch": 1.6006196031902973, + "grad_norm": 0.6999421119689941, + "learning_rate": 3.995250032985882e-05, + "loss": 0.0074, + "step": 6070 + }, + { + "epoch": 1.6019379078505043, + "grad_norm": 0.18698133528232574, + "learning_rate": 3.982055680168888e-05, + "loss": 0.0542, + "step": 6075 + }, + { + "epoch": 1.6032562125107113, + "grad_norm": 0.014717207290232182, + "learning_rate": 3.968861327351894e-05, + "loss": 0.0071, + "step": 6080 + }, + { + "epoch": 1.604574517170918, + "grad_norm": 0.0765385851264, + "learning_rate": 3.955666974534899e-05, + "loss": 0.0063, + "step": 6085 + }, + { + "epoch": 1.6058928218311253, + "grad_norm": 0.4332450330257416, + "learning_rate": 3.9424726217179044e-05, + "loss": 0.0071, + "step": 6090 + }, + { + "epoch": 1.607211126491332, + "grad_norm": 0.003700035158544779, + "learning_rate": 3.929278268900911e-05, + "loss": 0.0052, + "step": 6095 + }, + { + "epoch": 1.6085294311515392, + "grad_norm": 0.02500278130173683, + "learning_rate": 3.916083916083916e-05, + "loss": 0.0387, + "step": 6100 + }, + { + "epoch": 1.609847735811746, + "grad_norm": 0.023568281903862953, + "learning_rate": 3.902889563266922e-05, + "loss": 0.0594, + "step": 6105 + }, + { + "epoch": 1.6111660404719532, + "grad_norm": 0.02687825821340084, + "learning_rate": 3.8896952104499274e-05, + "loss": 0.0229, + "step": 6110 + }, + { + "epoch": 1.61248434513216, + "grad_norm": 0.005178579594939947, + "learning_rate": 3.876500857632933e-05, + "loss": 0.0293, + "step": 6115 + }, + { + "epoch": 1.613802649792367, + "grad_norm": 0.3987988531589508, + "learning_rate": 3.863306504815939e-05, + "loss": 0.015, + "step": 6120 + }, + { + "epoch": 
1.615120954452574, + "grad_norm": 0.18915466964244843, + "learning_rate": 3.850112151998945e-05, + "loss": 0.023, + "step": 6125 + }, + { + "epoch": 1.616439259112781, + "grad_norm": 0.015252528712153435, + "learning_rate": 3.83691779918195e-05, + "loss": 0.0185, + "step": 6130 + }, + { + "epoch": 1.617757563772988, + "grad_norm": 0.04947187379002571, + "learning_rate": 3.823723446364956e-05, + "loss": 0.0131, + "step": 6135 + }, + { + "epoch": 1.619075868433195, + "grad_norm": 0.017095958814024925, + "learning_rate": 3.8105290935479615e-05, + "loss": 0.0071, + "step": 6140 + }, + { + "epoch": 1.620394173093402, + "grad_norm": 0.013050337322056293, + "learning_rate": 3.797334740730967e-05, + "loss": 0.0038, + "step": 6145 + }, + { + "epoch": 1.6217124777536087, + "grad_norm": 0.08132806420326233, + "learning_rate": 3.784140387913973e-05, + "loss": 0.0043, + "step": 6150 + }, + { + "epoch": 1.623030782413816, + "grad_norm": 0.020741304382681847, + "learning_rate": 3.770946035096979e-05, + "loss": 0.006, + "step": 6155 + }, + { + "epoch": 1.6243490870740227, + "grad_norm": 0.0576217919588089, + "learning_rate": 3.7577516822799844e-05, + "loss": 0.0033, + "step": 6160 + }, + { + "epoch": 1.62566739173423, + "grad_norm": 0.03032900020480156, + "learning_rate": 3.74455732946299e-05, + "loss": 0.0318, + "step": 6165 + }, + { + "epoch": 1.6269856963944367, + "grad_norm": 0.8868799209594727, + "learning_rate": 3.7313629766459955e-05, + "loss": 0.0304, + "step": 6170 + }, + { + "epoch": 1.6283040010546437, + "grad_norm": 0.003816834883764386, + "learning_rate": 3.718168623829002e-05, + "loss": 0.003, + "step": 6175 + }, + { + "epoch": 1.6296223057148507, + "grad_norm": 0.05368296429514885, + "learning_rate": 3.704974271012007e-05, + "loss": 0.0064, + "step": 6180 + }, + { + "epoch": 1.6309406103750577, + "grad_norm": 0.09963366389274597, + "learning_rate": 3.691779918195012e-05, + "loss": 0.0097, + "step": 6185 + }, + { + "epoch": 1.6322589150352647, + "grad_norm": 0.006273225415498018, + "learning_rate": 3.6785855653780185e-05, + "loss": 0.0071, + "step": 6190 + }, + { + "epoch": 1.6335772196954716, + "grad_norm": 0.15079188346862793, + "learning_rate": 3.665391212561024e-05, + "loss": 0.0058, + "step": 6195 + }, + { + "epoch": 1.6348955243556786, + "grad_norm": 0.004980973433703184, + "learning_rate": 3.6521968597440296e-05, + "loss": 0.0051, + "step": 6200 + }, + { + "epoch": 1.6362138290158854, + "grad_norm": 0.004235363099724054, + "learning_rate": 3.639002506927036e-05, + "loss": 0.0028, + "step": 6205 + }, + { + "epoch": 1.6375321336760926, + "grad_norm": 0.003829963505268097, + "learning_rate": 3.625808154110041e-05, + "loss": 0.0347, + "step": 6210 + }, + { + "epoch": 1.6388504383362994, + "grad_norm": 0.021650686860084534, + "learning_rate": 3.612613801293046e-05, + "loss": 0.0036, + "step": 6215 + }, + { + "epoch": 1.6401687429965066, + "grad_norm": 0.06326934695243835, + "learning_rate": 3.5994194484760525e-05, + "loss": 0.0228, + "step": 6220 + }, + { + "epoch": 1.6414870476567134, + "grad_norm": 0.017276322469115257, + "learning_rate": 3.586225095659058e-05, + "loss": 0.0025, + "step": 6225 + }, + { + "epoch": 1.6428053523169206, + "grad_norm": 0.005066063720732927, + "learning_rate": 3.573030742842064e-05, + "loss": 0.0047, + "step": 6230 + }, + { + "epoch": 1.6441236569771274, + "grad_norm": 0.003512267954647541, + "learning_rate": 3.559836390025069e-05, + "loss": 0.0018, + "step": 6235 + }, + { + "epoch": 1.6454419616373344, + "grad_norm": 0.004347699694335461, + "learning_rate": 
3.546642037208075e-05, + "loss": 0.0045, + "step": 6240 + }, + { + "epoch": 1.6467602662975414, + "grad_norm": 0.008277533575892448, + "learning_rate": 3.533447684391081e-05, + "loss": 0.0456, + "step": 6245 + }, + { + "epoch": 1.6480785709577483, + "grad_norm": 0.00973033718764782, + "learning_rate": 3.5202533315740866e-05, + "loss": 0.0215, + "step": 6250 + }, + { + "epoch": 1.6493968756179553, + "grad_norm": 1.9432978630065918, + "learning_rate": 3.507058978757092e-05, + "loss": 0.0132, + "step": 6255 + }, + { + "epoch": 1.6507151802781623, + "grad_norm": 0.2693535387516022, + "learning_rate": 3.493864625940098e-05, + "loss": 0.0037, + "step": 6260 + }, + { + "epoch": 1.6520334849383693, + "grad_norm": 0.02107766456902027, + "learning_rate": 3.480670273123103e-05, + "loss": 0.0031, + "step": 6265 + }, + { + "epoch": 1.653351789598576, + "grad_norm": 0.07168436795473099, + "learning_rate": 3.467475920306109e-05, + "loss": 0.0101, + "step": 6270 + }, + { + "epoch": 1.6546700942587833, + "grad_norm": 0.06479799002408981, + "learning_rate": 3.454281567489115e-05, + "loss": 0.0032, + "step": 6275 + }, + { + "epoch": 1.65598839891899, + "grad_norm": 0.0013557536294683814, + "learning_rate": 3.441087214672121e-05, + "loss": 0.0037, + "step": 6280 + }, + { + "epoch": 1.6573067035791973, + "grad_norm": 0.07330150157213211, + "learning_rate": 3.427892861855126e-05, + "loss": 0.0031, + "step": 6285 + }, + { + "epoch": 1.658625008239404, + "grad_norm": 0.08246012777090073, + "learning_rate": 3.414698509038132e-05, + "loss": 0.0028, + "step": 6290 + }, + { + "epoch": 1.659943312899611, + "grad_norm": 0.6232367157936096, + "learning_rate": 3.4015041562211374e-05, + "loss": 0.0042, + "step": 6295 + }, + { + "epoch": 1.661261617559818, + "grad_norm": 0.007676729932427406, + "learning_rate": 3.388309803404143e-05, + "loss": 0.0501, + "step": 6300 + }, + { + "epoch": 1.662579922220025, + "grad_norm": 0.02081216312944889, + "learning_rate": 3.375115450587149e-05, + "loss": 0.0047, + "step": 6305 + }, + { + "epoch": 1.663898226880232, + "grad_norm": 0.008829087018966675, + "learning_rate": 3.361921097770154e-05, + "loss": 0.0298, + "step": 6310 + }, + { + "epoch": 1.665216531540439, + "grad_norm": 0.4426127076148987, + "learning_rate": 3.34872674495316e-05, + "loss": 0.0045, + "step": 6315 + }, + { + "epoch": 1.666534836200646, + "grad_norm": 0.025818035006523132, + "learning_rate": 3.335532392136166e-05, + "loss": 0.0028, + "step": 6320 + }, + { + "epoch": 1.6678531408608528, + "grad_norm": 0.6068133115768433, + "learning_rate": 3.3223380393191715e-05, + "loss": 0.0202, + "step": 6325 + }, + { + "epoch": 1.66917144552106, + "grad_norm": 0.02740122564136982, + "learning_rate": 3.309143686502178e-05, + "loss": 0.0025, + "step": 6330 + }, + { + "epoch": 1.6704897501812668, + "grad_norm": 0.15878735482692719, + "learning_rate": 3.2959493336851826e-05, + "loss": 0.004, + "step": 6335 + }, + { + "epoch": 1.671808054841474, + "grad_norm": 0.006827466655522585, + "learning_rate": 3.282754980868188e-05, + "loss": 0.0048, + "step": 6340 + }, + { + "epoch": 1.6731263595016808, + "grad_norm": 0.19508551061153412, + "learning_rate": 3.2695606280511944e-05, + "loss": 0.0025, + "step": 6345 + }, + { + "epoch": 1.674444664161888, + "grad_norm": 0.8176754713058472, + "learning_rate": 3.2563662752342e-05, + "loss": 0.0151, + "step": 6350 + }, + { + "epoch": 1.6757629688220947, + "grad_norm": 0.011672024615108967, + "learning_rate": 3.2431719224172055e-05, + "loss": 0.0452, + "step": 6355 + }, + { + "epoch": 
1.6770812734823017, + "grad_norm": 0.015824951231479645, + "learning_rate": 3.229977569600211e-05, + "loss": 0.0236, + "step": 6360 + }, + { + "epoch": 1.6783995781425087, + "grad_norm": 0.1358737051486969, + "learning_rate": 3.216783216783217e-05, + "loss": 0.0078, + "step": 6365 + }, + { + "epoch": 1.6797178828027157, + "grad_norm": 0.004896901547908783, + "learning_rate": 3.203588863966223e-05, + "loss": 0.0042, + "step": 6370 + }, + { + "epoch": 1.6810361874629227, + "grad_norm": 0.22593103349208832, + "learning_rate": 3.1903945111492285e-05, + "loss": 0.0053, + "step": 6375 + }, + { + "epoch": 1.6823544921231297, + "grad_norm": 0.0073196059092879295, + "learning_rate": 3.177200158332234e-05, + "loss": 0.0287, + "step": 6380 + }, + { + "epoch": 1.6836727967833367, + "grad_norm": 0.018524926155805588, + "learning_rate": 3.1640058055152396e-05, + "loss": 0.0122, + "step": 6385 + }, + { + "epoch": 1.6849911014435435, + "grad_norm": 0.7453815937042236, + "learning_rate": 3.150811452698245e-05, + "loss": 0.0378, + "step": 6390 + }, + { + "epoch": 1.6863094061037507, + "grad_norm": 0.22409795224666595, + "learning_rate": 3.137617099881251e-05, + "loss": 0.0282, + "step": 6395 + }, + { + "epoch": 1.6876277107639575, + "grad_norm": 0.005432693753391504, + "learning_rate": 3.124422747064257e-05, + "loss": 0.0162, + "step": 6400 + }, + { + "epoch": 1.6889460154241647, + "grad_norm": 0.1493055820465088, + "learning_rate": 3.1112283942472626e-05, + "loss": 0.0123, + "step": 6405 + }, + { + "epoch": 1.6902643200843714, + "grad_norm": 0.1638440042734146, + "learning_rate": 3.0980340414302674e-05, + "loss": 0.0058, + "step": 6410 + }, + { + "epoch": 1.6915826247445784, + "grad_norm": 0.015779908746480942, + "learning_rate": 3.084839688613274e-05, + "loss": 0.0157, + "step": 6415 + }, + { + "epoch": 1.6929009294047854, + "grad_norm": 0.0012348912423476577, + "learning_rate": 3.071645335796279e-05, + "loss": 0.0016, + "step": 6420 + }, + { + "epoch": 1.6942192340649924, + "grad_norm": 0.05294624716043472, + "learning_rate": 3.058450982979285e-05, + "loss": 0.0037, + "step": 6425 + }, + { + "epoch": 1.6955375387251994, + "grad_norm": 0.01926981844007969, + "learning_rate": 3.045256630162291e-05, + "loss": 0.0053, + "step": 6430 + }, + { + "epoch": 1.6968558433854064, + "grad_norm": 0.005958891473710537, + "learning_rate": 3.0320622773452963e-05, + "loss": 0.0025, + "step": 6435 + }, + { + "epoch": 1.6981741480456134, + "grad_norm": 0.001902201445773244, + "learning_rate": 3.018867924528302e-05, + "loss": 0.0027, + "step": 6440 + }, + { + "epoch": 1.6994924527058202, + "grad_norm": 0.036614127457141876, + "learning_rate": 3.0056735717113078e-05, + "loss": 0.0026, + "step": 6445 + }, + { + "epoch": 1.7008107573660274, + "grad_norm": 0.07294526696205139, + "learning_rate": 2.9924792188943133e-05, + "loss": 0.0042, + "step": 6450 + }, + { + "epoch": 1.7021290620262342, + "grad_norm": 0.42822372913360596, + "learning_rate": 2.9792848660773192e-05, + "loss": 0.013, + "step": 6455 + }, + { + "epoch": 1.7034473666864414, + "grad_norm": 0.036622967571020126, + "learning_rate": 2.9660905132603245e-05, + "loss": 0.0029, + "step": 6460 + }, + { + "epoch": 1.7047656713466481, + "grad_norm": 0.08314034342765808, + "learning_rate": 2.9528961604433304e-05, + "loss": 0.0043, + "step": 6465 + }, + { + "epoch": 1.7060839760068551, + "grad_norm": 0.0005654952838085592, + "learning_rate": 2.939701807626336e-05, + "loss": 0.0595, + "step": 6470 + }, + { + "epoch": 1.7074022806670621, + "grad_norm": 0.004545385017991066, + 
"learning_rate": 2.926507454809342e-05, + "loss": 0.0044, + "step": 6475 + }, + { + "epoch": 1.7087205853272691, + "grad_norm": 0.00033831383916549385, + "learning_rate": 2.9133131019923477e-05, + "loss": 0.0046, + "step": 6480 + }, + { + "epoch": 1.710038889987476, + "grad_norm": 0.0019903562497347593, + "learning_rate": 2.900118749175353e-05, + "loss": 0.0026, + "step": 6485 + }, + { + "epoch": 1.711357194647683, + "grad_norm": 0.10188104957342148, + "learning_rate": 2.8869243963583585e-05, + "loss": 0.0069, + "step": 6490 + }, + { + "epoch": 1.71267549930789, + "grad_norm": 0.2123432606458664, + "learning_rate": 2.8737300435413644e-05, + "loss": 0.0199, + "step": 6495 + }, + { + "epoch": 1.7139938039680969, + "grad_norm": 0.43209517002105713, + "learning_rate": 2.8605356907243703e-05, + "loss": 0.0099, + "step": 6500 + }, + { + "epoch": 1.7139938039680969, + "eval_loss": 0.024327505379915237, + "eval_runtime": 452.0052, + "eval_samples_per_second": 7.46, + "eval_steps_per_second": 3.73, + "step": 6500 + }, + { + "epoch": 1.715312108628304, + "grad_norm": 0.009868285618722439, + "learning_rate": 2.847341337907376e-05, + "loss": 0.0025, + "step": 6505 + }, + { + "epoch": 1.7166304132885108, + "grad_norm": 0.00778606254607439, + "learning_rate": 2.834146985090381e-05, + "loss": 0.0028, + "step": 6510 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.02987460047006607, + "learning_rate": 2.820952632273387e-05, + "loss": 0.0068, + "step": 6515 + }, + { + "epoch": 1.7192670226089248, + "grad_norm": 0.04475142061710358, + "learning_rate": 2.807758279456393e-05, + "loss": 0.0022, + "step": 6520 + }, + { + "epoch": 1.720585327269132, + "grad_norm": 0.12720516324043274, + "learning_rate": 2.7945639266393985e-05, + "loss": 0.0488, + "step": 6525 + }, + { + "epoch": 1.7219036319293388, + "grad_norm": 0.0011463731061667204, + "learning_rate": 2.7813695738224044e-05, + "loss": 0.0023, + "step": 6530 + }, + { + "epoch": 1.7232219365895458, + "grad_norm": 0.008907752111554146, + "learning_rate": 2.7681752210054096e-05, + "loss": 0.0039, + "step": 6535 + }, + { + "epoch": 1.7245402412497528, + "grad_norm": 0.008416680619120598, + "learning_rate": 2.7549808681884156e-05, + "loss": 0.0055, + "step": 6540 + }, + { + "epoch": 1.7258585459099598, + "grad_norm": 0.26278871297836304, + "learning_rate": 2.741786515371421e-05, + "loss": 0.0386, + "step": 6545 + }, + { + "epoch": 1.7271768505701668, + "grad_norm": 0.01750275492668152, + "learning_rate": 2.728592162554427e-05, + "loss": 0.0048, + "step": 6550 + }, + { + "epoch": 1.7284951552303738, + "grad_norm": 0.009483959525823593, + "learning_rate": 2.7153978097374326e-05, + "loss": 0.0061, + "step": 6555 + }, + { + "epoch": 1.7298134598905808, + "grad_norm": 0.016591722145676613, + "learning_rate": 2.7022034569204378e-05, + "loss": 0.0058, + "step": 6560 + }, + { + "epoch": 1.7311317645507875, + "grad_norm": 0.5120682716369629, + "learning_rate": 2.6890091041034437e-05, + "loss": 0.0229, + "step": 6565 + }, + { + "epoch": 1.7324500692109948, + "grad_norm": 0.03748248517513275, + "learning_rate": 2.6758147512864496e-05, + "loss": 0.0026, + "step": 6570 + }, + { + "epoch": 1.7337683738712015, + "grad_norm": 0.08328749984502792, + "learning_rate": 2.6626203984694552e-05, + "loss": 0.0052, + "step": 6575 + }, + { + "epoch": 1.7350866785314087, + "grad_norm": 0.012284482829272747, + "learning_rate": 2.649426045652461e-05, + "loss": 0.0353, + "step": 6580 + }, + { + "epoch": 1.7364049831916155, + "grad_norm": 0.06362583488225937, + "learning_rate": 
2.6362316928354663e-05, + "loss": 0.0309, + "step": 6585 + }, + { + "epoch": 1.7377232878518225, + "grad_norm": 0.01475360058248043, + "learning_rate": 2.6230373400184722e-05, + "loss": 0.0034, + "step": 6590 + }, + { + "epoch": 1.7390415925120295, + "grad_norm": 0.002241638721898198, + "learning_rate": 2.6098429872014778e-05, + "loss": 0.0365, + "step": 6595 + }, + { + "epoch": 1.7403598971722365, + "grad_norm": 0.11375941336154938, + "learning_rate": 2.5966486343844837e-05, + "loss": 0.0241, + "step": 6600 + }, + { + "epoch": 1.7416782018324435, + "grad_norm": 0.009631779976189137, + "learning_rate": 2.5834542815674896e-05, + "loss": 0.0026, + "step": 6605 + }, + { + "epoch": 1.7429965064926505, + "grad_norm": 0.12113262712955475, + "learning_rate": 2.570259928750495e-05, + "loss": 0.0207, + "step": 6610 + }, + { + "epoch": 1.7443148111528575, + "grad_norm": 0.006536155007779598, + "learning_rate": 2.5570655759335004e-05, + "loss": 0.0022, + "step": 6615 + }, + { + "epoch": 1.7456331158130642, + "grad_norm": 0.043030887842178345, + "learning_rate": 2.5438712231165063e-05, + "loss": 0.003, + "step": 6620 + }, + { + "epoch": 1.7469514204732715, + "grad_norm": 0.00860620103776455, + "learning_rate": 2.5306768702995122e-05, + "loss": 0.027, + "step": 6625 + }, + { + "epoch": 1.7482697251334782, + "grad_norm": 0.014589210972189903, + "learning_rate": 2.5174825174825178e-05, + "loss": 0.0224, + "step": 6630 + }, + { + "epoch": 1.7495880297936854, + "grad_norm": 0.01215316355228424, + "learning_rate": 2.504288164665523e-05, + "loss": 0.011, + "step": 6635 + }, + { + "epoch": 1.7509063344538922, + "grad_norm": 0.10951556265354156, + "learning_rate": 2.491093811848529e-05, + "loss": 0.0384, + "step": 6640 + }, + { + "epoch": 1.7522246391140994, + "grad_norm": 0.30859875679016113, + "learning_rate": 2.4778994590315345e-05, + "loss": 0.0031, + "step": 6645 + }, + { + "epoch": 1.7535429437743062, + "grad_norm": 0.025427229702472687, + "learning_rate": 2.4647051062145404e-05, + "loss": 0.0171, + "step": 6650 + }, + { + "epoch": 1.7548612484345132, + "grad_norm": 0.03334197774529457, + "learning_rate": 2.451510753397546e-05, + "loss": 0.0473, + "step": 6655 + }, + { + "epoch": 1.7561795530947202, + "grad_norm": 0.013445639982819557, + "learning_rate": 2.438316400580552e-05, + "loss": 0.0056, + "step": 6660 + }, + { + "epoch": 1.7574978577549272, + "grad_norm": 0.008306960575282574, + "learning_rate": 2.425122047763557e-05, + "loss": 0.0104, + "step": 6665 + }, + { + "epoch": 1.7588161624151342, + "grad_norm": 0.012615012936294079, + "learning_rate": 2.411927694946563e-05, + "loss": 0.0097, + "step": 6670 + }, + { + "epoch": 1.7601344670753412, + "grad_norm": 0.006827410310506821, + "learning_rate": 2.398733342129569e-05, + "loss": 0.0057, + "step": 6675 + }, + { + "epoch": 1.7614527717355482, + "grad_norm": 0.017035294324159622, + "learning_rate": 2.3855389893125745e-05, + "loss": 0.0035, + "step": 6680 + }, + { + "epoch": 1.762771076395755, + "grad_norm": 0.036102693527936935, + "learning_rate": 2.37234463649558e-05, + "loss": 0.0031, + "step": 6685 + }, + { + "epoch": 1.7640893810559621, + "grad_norm": 0.5004498958587646, + "learning_rate": 2.3591502836785856e-05, + "loss": 0.0217, + "step": 6690 + }, + { + "epoch": 1.765407685716169, + "grad_norm": 0.017726672813296318, + "learning_rate": 2.3459559308615915e-05, + "loss": 0.0112, + "step": 6695 + }, + { + "epoch": 1.7667259903763761, + "grad_norm": 0.00940331444144249, + "learning_rate": 2.332761578044597e-05, + "loss": 0.0107, + "step": 6700 + }, 
+ { + "epoch": 1.768044295036583, + "grad_norm": 0.007495497819036245, + "learning_rate": 2.3195672252276026e-05, + "loss": 0.0032, + "step": 6705 + }, + { + "epoch": 1.7693625996967899, + "grad_norm": 0.6863199472427368, + "learning_rate": 2.3063728724106085e-05, + "loss": 0.034, + "step": 6710 + }, + { + "epoch": 1.7706809043569969, + "grad_norm": 0.004587489180266857, + "learning_rate": 2.293178519593614e-05, + "loss": 0.0032, + "step": 6715 + }, + { + "epoch": 1.7719992090172039, + "grad_norm": 0.017706016078591347, + "learning_rate": 2.2799841667766197e-05, + "loss": 0.0036, + "step": 6720 + }, + { + "epoch": 1.7733175136774109, + "grad_norm": 0.012740216217935085, + "learning_rate": 2.2667898139596252e-05, + "loss": 0.0147, + "step": 6725 + }, + { + "epoch": 1.7746358183376179, + "grad_norm": 0.010391579940915108, + "learning_rate": 2.253595461142631e-05, + "loss": 0.0041, + "step": 6730 + }, + { + "epoch": 1.7759541229978248, + "grad_norm": 0.021570540964603424, + "learning_rate": 2.2404011083256367e-05, + "loss": 0.0363, + "step": 6735 + }, + { + "epoch": 1.7772724276580316, + "grad_norm": 0.005778402555733919, + "learning_rate": 2.2272067555086423e-05, + "loss": 0.002, + "step": 6740 + }, + { + "epoch": 1.7785907323182388, + "grad_norm": 0.0, + "learning_rate": 2.2140124026916482e-05, + "loss": 0.0058, + "step": 6745 + }, + { + "epoch": 1.7799090369784456, + "grad_norm": 0.010869967751204967, + "learning_rate": 2.2008180498746537e-05, + "loss": 0.0036, + "step": 6750 + }, + { + "epoch": 1.7812273416386528, + "grad_norm": 0.04336518794298172, + "learning_rate": 2.1876236970576593e-05, + "loss": 0.0074, + "step": 6755 + }, + { + "epoch": 1.7825456462988596, + "grad_norm": 0.008664094842970371, + "learning_rate": 2.1744293442406652e-05, + "loss": 0.0027, + "step": 6760 + }, + { + "epoch": 1.7838639509590668, + "grad_norm": 0.9408183097839355, + "learning_rate": 2.1612349914236708e-05, + "loss": 0.0371, + "step": 6765 + }, + { + "epoch": 1.7851822556192736, + "grad_norm": 0.016822539269924164, + "learning_rate": 2.1480406386066763e-05, + "loss": 0.0137, + "step": 6770 + }, + { + "epoch": 1.7865005602794806, + "grad_norm": 0.00829544197767973, + "learning_rate": 2.134846285789682e-05, + "loss": 0.0134, + "step": 6775 + }, + { + "epoch": 1.7878188649396876, + "grad_norm": 0.0035508016590029, + "learning_rate": 2.1216519329726878e-05, + "loss": 0.0231, + "step": 6780 + }, + { + "epoch": 1.7891371695998946, + "grad_norm": 0.13871321082115173, + "learning_rate": 2.1084575801556937e-05, + "loss": 0.0296, + "step": 6785 + }, + { + "epoch": 1.7904554742601015, + "grad_norm": 0.002578354673460126, + "learning_rate": 2.095263227338699e-05, + "loss": 0.0178, + "step": 6790 + }, + { + "epoch": 1.7917737789203085, + "grad_norm": 0.5279458165168762, + "learning_rate": 2.082068874521705e-05, + "loss": 0.0336, + "step": 6795 + }, + { + "epoch": 1.7930920835805155, + "grad_norm": 0.0017439400544390082, + "learning_rate": 2.0688745217047104e-05, + "loss": 0.0031, + "step": 6800 + }, + { + "epoch": 1.7944103882407223, + "grad_norm": 0.007989778183400631, + "learning_rate": 2.055680168887716e-05, + "loss": 0.0081, + "step": 6805 + }, + { + "epoch": 1.7957286929009295, + "grad_norm": 0.015163813717663288, + "learning_rate": 2.042485816070722e-05, + "loss": 0.0234, + "step": 6810 + }, + { + "epoch": 1.7970469975611363, + "grad_norm": 0.10615389794111252, + "learning_rate": 2.0292914632537275e-05, + "loss": 0.0144, + "step": 6815 + }, + { + "epoch": 1.7983653022213435, + "grad_norm": 0.03466172143816948, + 
"learning_rate": 2.0160971104367334e-05, + "loss": 0.0036, + "step": 6820 + }, + { + "epoch": 1.7996836068815503, + "grad_norm": 0.047511328011751175, + "learning_rate": 2.0029027576197386e-05, + "loss": 0.002, + "step": 6825 + }, + { + "epoch": 1.8010019115417573, + "grad_norm": 0.019772246479988098, + "learning_rate": 1.9897084048027445e-05, + "loss": 0.0049, + "step": 6830 + }, + { + "epoch": 1.8023202162019643, + "grad_norm": 0.1156701073050499, + "learning_rate": 1.9765140519857504e-05, + "loss": 0.0033, + "step": 6835 + }, + { + "epoch": 1.8036385208621712, + "grad_norm": 0.010991690680384636, + "learning_rate": 1.963319699168756e-05, + "loss": 0.0036, + "step": 6840 + }, + { + "epoch": 1.8049568255223782, + "grad_norm": 0.29658815264701843, + "learning_rate": 1.9501253463517615e-05, + "loss": 0.0042, + "step": 6845 + }, + { + "epoch": 1.8062751301825852, + "grad_norm": 0.056147243827581406, + "learning_rate": 1.936930993534767e-05, + "loss": 0.0052, + "step": 6850 + }, + { + "epoch": 1.8075934348427922, + "grad_norm": 0.010382590815424919, + "learning_rate": 1.923736640717773e-05, + "loss": 0.0033, + "step": 6855 + }, + { + "epoch": 1.808911739502999, + "grad_norm": 1.1247020959854126, + "learning_rate": 1.9105422879007786e-05, + "loss": 0.0112, + "step": 6860 + }, + { + "epoch": 1.8102300441632062, + "grad_norm": 1.4515737295150757, + "learning_rate": 1.897347935083784e-05, + "loss": 0.0202, + "step": 6865 + }, + { + "epoch": 1.811548348823413, + "grad_norm": 0.016307830810546875, + "learning_rate": 1.88415358226679e-05, + "loss": 0.0148, + "step": 6870 + }, + { + "epoch": 1.8128666534836202, + "grad_norm": 0.0745878592133522, + "learning_rate": 1.8709592294497956e-05, + "loss": 0.0062, + "step": 6875 + }, + { + "epoch": 1.814184958143827, + "grad_norm": 0.02554013952612877, + "learning_rate": 1.8577648766328012e-05, + "loss": 0.003, + "step": 6880 + }, + { + "epoch": 1.815503262804034, + "grad_norm": 0.45748665928840637, + "learning_rate": 1.844570523815807e-05, + "loss": 0.0386, + "step": 6885 + }, + { + "epoch": 1.816821567464241, + "grad_norm": 0.013801589608192444, + "learning_rate": 1.8313761709988126e-05, + "loss": 0.0342, + "step": 6890 + }, + { + "epoch": 1.818139872124448, + "grad_norm": 0.6251696944236755, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.0101, + "step": 6895 + }, + { + "epoch": 1.819458176784655, + "grad_norm": 0.28203102946281433, + "learning_rate": 1.8049874653648238e-05, + "loss": 0.0032, + "step": 6900 + }, + { + "epoch": 1.820776481444862, + "grad_norm": 0.28511062264442444, + "learning_rate": 1.7917931125478297e-05, + "loss": 0.0343, + "step": 6905 + }, + { + "epoch": 1.822094786105069, + "grad_norm": 0.004940215498209, + "learning_rate": 1.7785987597308352e-05, + "loss": 0.0265, + "step": 6910 + }, + { + "epoch": 1.8234130907652757, + "grad_norm": 0.002903093583881855, + "learning_rate": 1.7654044069138408e-05, + "loss": 0.0025, + "step": 6915 + }, + { + "epoch": 1.824731395425483, + "grad_norm": 0.008801674470305443, + "learning_rate": 1.7522100540968467e-05, + "loss": 0.0246, + "step": 6920 + }, + { + "epoch": 1.8260497000856897, + "grad_norm": 0.13823826611042023, + "learning_rate": 1.7390157012798523e-05, + "loss": 0.0058, + "step": 6925 + }, + { + "epoch": 1.827368004745897, + "grad_norm": 0.020868878811597824, + "learning_rate": 1.725821348462858e-05, + "loss": 0.0014, + "step": 6930 + }, + { + "epoch": 1.8286863094061037, + "grad_norm": 0.0027356524951756, + "learning_rate": 1.7126269956458638e-05, + "loss": 0.0035, + "step": 6935 + 
}, + { + "epoch": 1.8300046140663109, + "grad_norm": 0.06023023650050163, + "learning_rate": 1.6994326428288693e-05, + "loss": 0.0212, + "step": 6940 + }, + { + "epoch": 1.8313229187265176, + "grad_norm": 0.0009826788445934653, + "learning_rate": 1.686238290011875e-05, + "loss": 0.0034, + "step": 6945 + }, + { + "epoch": 1.8326412233867246, + "grad_norm": 0.2867647707462311, + "learning_rate": 1.6730439371948805e-05, + "loss": 0.0146, + "step": 6950 + }, + { + "epoch": 1.8339595280469316, + "grad_norm": 0.004501632414758205, + "learning_rate": 1.6598495843778864e-05, + "loss": 0.0026, + "step": 6955 + }, + { + "epoch": 1.8352778327071386, + "grad_norm": 0.01251616608351469, + "learning_rate": 1.6466552315608923e-05, + "loss": 0.0107, + "step": 6960 + }, + { + "epoch": 1.8365961373673456, + "grad_norm": 0.054781850427389145, + "learning_rate": 1.6334608787438975e-05, + "loss": 0.0044, + "step": 6965 + }, + { + "epoch": 1.8379144420275526, + "grad_norm": 0.1120501235127449, + "learning_rate": 1.6202665259269034e-05, + "loss": 0.0284, + "step": 6970 + }, + { + "epoch": 1.8392327466877596, + "grad_norm": 0.001668553682975471, + "learning_rate": 1.607072173109909e-05, + "loss": 0.0169, + "step": 6975 + }, + { + "epoch": 1.8405510513479664, + "grad_norm": 1.6374458074569702, + "learning_rate": 1.593877820292915e-05, + "loss": 0.031, + "step": 6980 + }, + { + "epoch": 1.8418693560081736, + "grad_norm": 0.012474550865590572, + "learning_rate": 1.5806834674759204e-05, + "loss": 0.0037, + "step": 6985 + }, + { + "epoch": 1.8431876606683804, + "grad_norm": 0.014898869208991528, + "learning_rate": 1.567489114658926e-05, + "loss": 0.003, + "step": 6990 + }, + { + "epoch": 1.8445059653285876, + "grad_norm": 0.035570453852415085, + "learning_rate": 1.554294761841932e-05, + "loss": 0.0038, + "step": 6995 + }, + { + "epoch": 1.8458242699887943, + "grad_norm": 0.9279152750968933, + "learning_rate": 1.541100409024937e-05, + "loss": 0.0235, + "step": 7000 + }, + { + "epoch": 1.8458242699887943, + "eval_loss": 0.022339830175042152, + "eval_runtime": 451.9068, + "eval_samples_per_second": 7.462, + "eval_steps_per_second": 3.731, + "step": 7000 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.6496806486741606e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7000/training_args.bin b/checkpoint-7000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-7000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-7500/README.md b/checkpoint-7500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-7500/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by 
[optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-7500/adapter_config.json b/checkpoint-7500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-7500/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-7500/adapter_model.safetensors b/checkpoint-7500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3dbf9b0d2d6f5cbbbea73d67bdcbea140c063d12 --- /dev/null +++ b/checkpoint-7500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eec5255201d5280a9cc4109c876af644b0be90558291966e9bf4bf6909dca91f +size 97307544 diff --git a/checkpoint-7500/optimizer.pt b/checkpoint-7500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a16e91899827a644338ea31ec8e93a0b0dbc4a58 --- /dev/null +++ b/checkpoint-7500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b47fbbe8f6504a74e331722ed55c613ea95c70fc41366bd3039b5f5f246fa08 +size 50866370 diff --git a/checkpoint-7500/rng_state.pth b/checkpoint-7500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6184fd4d1318a0aa9f43ef428e9eac008b905459 --- /dev/null +++ b/checkpoint-7500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a70fcdbdddd24f5a3236fc25540034ecb89a2d4e934b73da08b17b4d5ac00b3 +size 14244 diff --git a/checkpoint-7500/scheduler.pt b/checkpoint-7500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..238349b7ee74cb6522d967fd82a428814de86a85 --- /dev/null +++ b/checkpoint-7500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:13aa9fc9c1cccae8736e3f98189b2587bc708b45707ec9e78d3c20ca08f71927 +size 1064 diff --git a/checkpoint-7500/special_tokens_map.json b/checkpoint-7500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-7500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-7500/tokenizer.json b/checkpoint-7500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-7500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-7500/tokenizer_config.json b/checkpoint-7500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-7500/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": 
"<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": 
"<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": 
"<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": 
"<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": 
"<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": 
"<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": 
"<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": 
"<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": 
"<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-7500/trainer_state.json b/checkpoint-7500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4ad0e2ef8b0c3533c2a14c1dee44f594a0620a62 --- /dev/null +++ b/checkpoint-7500/trainer_state.json @@ -0,0 +1,10656 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + 
"epoch": 1.9776547360094918, + "eval_steps": 500, + "global_step": 7500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + "step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 
0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + 
"learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + "learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + 
"learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, 
+ "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + "learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + 
"learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + 
"grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + 
"step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 
0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + "loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + 
"learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + 
"epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 
0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + 
"epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 
0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + 
"loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + }, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 
0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + 
"learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 
0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + "loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 
0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + "grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + 
"learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + 
"step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, + { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 
0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.03460961952805519, + "learning_rate": 0.00013402823591502838, + "loss": 0.0097, + "step": 2505 + }, + { + "epoch": 0.6617889394239008, + "grad_norm": 0.31785696744918823, + "learning_rate": 0.00013389629238685842, + "loss": 0.0303, + "step": 2510 + }, + { + "epoch": 0.6631072440841078, + "grad_norm": 0.4273851215839386, + "learning_rate": 0.0001337643488586885, + "loss": 0.0499, + "step": 2515 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.02236153744161129, + "learning_rate": 0.00013363240533051856, + "loss": 0.0069, + "step": 2520 + }, + { + "epoch": 0.6657438534045218, + "grad_norm": 0.1592864990234375, + "learning_rate": 0.0001335004618023486, + "loss": 0.0326, + "step": 2525 + }, + { + "epoch": 0.6670621580647288, + "grad_norm": 0.029961545020341873, + "learning_rate": 0.00013336851827417867, + "loss": 0.0178, + "step": 2530 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.03120764158666134, + "learning_rate": 0.00013323657474600872, + "loss": 0.115, + "step": 2535 + }, + { + "epoch": 0.6696987673851427, + "grad_norm": 0.01060028001666069, + "learning_rate": 0.00013310463121783879, + "loss": 0.0036, + "step": 2540 + }, + { + "epoch": 0.6710170720453497, + "grad_norm": 0.053470809012651443, + "learning_rate": 0.00013297268768966883, + "loss": 0.0079, + "step": 2545 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.022777097299695015, + "learning_rate": 0.00013284074416149887, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6736536813657636, + "grad_norm": 0.0548521913588047, + "learning_rate": 0.00013270880063332894, + "loss": 0.0503, + "step": 2555 + }, + { + "epoch": 0.6749719860259706, + "grad_norm": 0.02028457075357437, + "learning_rate": 0.00013257685710515898, + "loss": 0.0096, + "step": 2560 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.01569107361137867, + "learning_rate": 0.00013244491357698905, + "loss": 0.008, + "step": 2565 + }, + { + "epoch": 0.6776085953463845, + "grad_norm": 0.00743742985650897, + "learning_rate": 0.00013231297004881912, + "loss": 0.005, + "step": 2570 + }, + { + "epoch": 0.6789269000065915, + "grad_norm": 0.025164416059851646, + "learning_rate": 0.00013218102652064916, + "loss": 0.018, + "step": 2575 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.3653188645839691, + "learning_rate": 0.00013204908299247923, + "loss": 0.0295, + "step": 2580 + }, + { + "epoch": 0.6815635093270055, + "grad_norm": 0.685422956943512, + "learning_rate": 0.00013191713946430927, + "loss": 0.0335, + "step": 2585 + }, + { + "epoch": 0.6828818139872125, + "grad_norm": 0.675740122795105, + "learning_rate": 0.00013178519593613934, + "loss": 0.0592, + "step": 2590 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.10513252764940262, + "learning_rate": 0.00013165325240796938, + "loss": 0.0353, + "step": 2595 + }, + { + "epoch": 0.6855184233076264, + "grad_norm": 0.43512973189353943, + "learning_rate": 0.00013152130887979945, + "loss": 0.0142, + "step": 2600 + }, + { + "epoch": 0.6868367279678333, + "grad_norm": 0.029436839744448662, + "learning_rate": 0.00013138936535162952, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.5607122778892517, + "learning_rate": 0.00013125742182345957, + "loss": 0.0184, + "step": 2610 + }, + { + "epoch": 0.6894733372882473, + "grad_norm": 0.11365406215190887, + 
"learning_rate": 0.00013112547829528963, + "loss": 0.006, + "step": 2615 + }, + { + "epoch": 0.6907916419484543, + "grad_norm": 0.047227244824171066, + "learning_rate": 0.00013099353476711968, + "loss": 0.008, + "step": 2620 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.0005877618095837533, + "learning_rate": 0.00013086159123894975, + "loss": 0.0286, + "step": 2625 + }, + { + "epoch": 0.6934282512688682, + "grad_norm": 0.010759112425148487, + "learning_rate": 0.0001307296477107798, + "loss": 0.0062, + "step": 2630 + }, + { + "epoch": 0.6947465559290752, + "grad_norm": 0.07117745280265808, + "learning_rate": 0.00013059770418260986, + "loss": 0.0891, + "step": 2635 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.0639057606458664, + "learning_rate": 0.00013046576065443993, + "loss": 0.0072, + "step": 2640 + }, + { + "epoch": 0.6973831652494892, + "grad_norm": 0.027350090444087982, + "learning_rate": 0.00013033381712626994, + "loss": 0.0103, + "step": 2645 + }, + { + "epoch": 0.6987014699096962, + "grad_norm": 0.015336195938289165, + "learning_rate": 0.0001302018735981, + "loss": 0.0041, + "step": 2650 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 1.0650830268859863, + "learning_rate": 0.00013006993006993008, + "loss": 0.0443, + "step": 2655 + }, + { + "epoch": 0.70133807923011, + "grad_norm": 0.019073212519288063, + "learning_rate": 0.00012993798654176012, + "loss": 0.0331, + "step": 2660 + }, + { + "epoch": 0.702656383890317, + "grad_norm": 0.10109209269285202, + "learning_rate": 0.0001298060430135902, + "loss": 0.0054, + "step": 2665 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.03528957813978195, + "learning_rate": 0.00012967409948542023, + "loss": 0.0427, + "step": 2670 + }, + { + "epoch": 0.705292993210731, + "grad_norm": 0.03577788919210434, + "learning_rate": 0.0001295421559572503, + "loss": 0.023, + "step": 2675 + }, + { + "epoch": 0.706611297870938, + "grad_norm": 0.5576180815696716, + "learning_rate": 0.00012941021242908034, + "loss": 0.0416, + "step": 2680 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.017131298780441284, + "learning_rate": 0.0001292782689009104, + "loss": 0.0235, + "step": 2685 + }, + { + "epoch": 0.7092479071913519, + "grad_norm": 0.8517888784408569, + "learning_rate": 0.00012914632537274048, + "loss": 0.0168, + "step": 2690 + }, + { + "epoch": 0.7105662118515589, + "grad_norm": 0.23812156915664673, + "learning_rate": 0.00012901438184457052, + "loss": 0.0483, + "step": 2695 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.11746613681316376, + "learning_rate": 0.0001288824383164006, + "loss": 0.0255, + "step": 2700 + }, + { + "epoch": 0.7132028211719729, + "grad_norm": 0.20089928805828094, + "learning_rate": 0.00012875049478823064, + "loss": 0.0267, + "step": 2705 + }, + { + "epoch": 0.7145211258321799, + "grad_norm": 0.8301129937171936, + "learning_rate": 0.0001286185512600607, + "loss": 0.016, + "step": 2710 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.01838674768805504, + "learning_rate": 0.00012848660773189077, + "loss": 0.0229, + "step": 2715 + }, + { + "epoch": 0.7171577351525937, + "grad_norm": 0.03670337051153183, + "learning_rate": 0.00012835466420372082, + "loss": 0.038, + "step": 2720 + }, + { + "epoch": 0.7184760398128007, + "grad_norm": 0.0452633760869503, + "learning_rate": 0.00012822272067555089, + "loss": 0.0622, + "step": 2725 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.09503110498189926, + "learning_rate": 0.00012809077714738093, + "loss": 0.0209, + "step": 2730 + }, 
+ { + "epoch": 0.7211126491332147, + "grad_norm": 1.0327308177947998, + "learning_rate": 0.000127958833619211, + "loss": 0.0361, + "step": 2735 + }, + { + "epoch": 0.7224309537934217, + "grad_norm": 1.0049290657043457, + "learning_rate": 0.00012782689009104104, + "loss": 0.0365, + "step": 2740 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.029774073511362076, + "learning_rate": 0.00012769494656287108, + "loss": 0.0257, + "step": 2745 + }, + { + "epoch": 0.7250675631138356, + "grad_norm": 0.20974040031433105, + "learning_rate": 0.00012756300303470115, + "loss": 0.0542, + "step": 2750 + }, + { + "epoch": 0.7263858677740426, + "grad_norm": 0.8153854608535767, + "learning_rate": 0.0001274310595065312, + "loss": 0.0216, + "step": 2755 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.4393698573112488, + "learning_rate": 0.00012729911597836126, + "loss": 0.0451, + "step": 2760 + }, + { + "epoch": 0.7290224770944566, + "grad_norm": 0.06990349292755127, + "learning_rate": 0.00012716717245019133, + "loss": 0.03, + "step": 2765 + }, + { + "epoch": 0.7303407817546635, + "grad_norm": 0.32689470052719116, + "learning_rate": 0.00012703522892202137, + "loss": 0.0263, + "step": 2770 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.026600876823067665, + "learning_rate": 0.00012690328539385144, + "loss": 0.0404, + "step": 2775 + }, + { + "epoch": 0.7329773910750774, + "grad_norm": 0.11228257417678833, + "learning_rate": 0.00012677134186568148, + "loss": 0.0224, + "step": 2780 + }, + { + "epoch": 0.7342956957352844, + "grad_norm": 0.6469443440437317, + "learning_rate": 0.00012663939833751155, + "loss": 0.0178, + "step": 2785 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.020773250609636307, + "learning_rate": 0.0001265074548093416, + "loss": 0.011, + "step": 2790 + }, + { + "epoch": 0.7369323050556984, + "grad_norm": 0.7378728985786438, + "learning_rate": 0.00012637551128117167, + "loss": 0.0227, + "step": 2795 + }, + { + "epoch": 0.7382506097159054, + "grad_norm": 0.008189595304429531, + "learning_rate": 0.00012624356775300173, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.031633853912353516, + "learning_rate": 0.00012611162422483178, + "loss": 0.0093, + "step": 2805 + }, + { + "epoch": 0.7408872190363193, + "grad_norm": 0.5078475475311279, + "learning_rate": 0.00012597968069666185, + "loss": 0.0567, + "step": 2810 + }, + { + "epoch": 0.7422055236965263, + "grad_norm": 0.21766887605190277, + "learning_rate": 0.0001258477371684919, + "loss": 0.0485, + "step": 2815 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.3029612898826599, + "learning_rate": 0.00012571579364032196, + "loss": 0.032, + "step": 2820 + }, + { + "epoch": 0.7448421330169402, + "grad_norm": 1.2135159969329834, + "learning_rate": 0.00012558385011215203, + "loss": 0.0139, + "step": 2825 + }, + { + "epoch": 0.7461604376771472, + "grad_norm": 0.016875172033905983, + "learning_rate": 0.00012545190658398207, + "loss": 0.0323, + "step": 2830 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.08923230320215225, + "learning_rate": 0.00012531996305581214, + "loss": 0.0343, + "step": 2835 + }, + { + "epoch": 0.7487970469975611, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.00012518801952764215, + "loss": 0.0431, + "step": 2840 + }, + { + "epoch": 0.7501153516577681, + "grad_norm": 0.7344386577606201, + "learning_rate": 0.00012505607599947222, + "loss": 0.0389, + "step": 2845 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.03681635856628418, + 
"learning_rate": 0.0001249241324713023, + "loss": 0.0258, + "step": 2850 + }, + { + "epoch": 0.7527519609781821, + "grad_norm": 0.22866861522197723, + "learning_rate": 0.00012479218894313233, + "loss": 0.0223, + "step": 2855 + }, + { + "epoch": 0.7540702656383891, + "grad_norm": 0.029770435765385628, + "learning_rate": 0.0001246602454149624, + "loss": 0.0205, + "step": 2860 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.011845707893371582, + "learning_rate": 0.00012452830188679244, + "loss": 0.0252, + "step": 2865 + }, + { + "epoch": 0.756706874958803, + "grad_norm": 0.06696149706840515, + "learning_rate": 0.00012439635835862251, + "loss": 0.0166, + "step": 2870 + }, + { + "epoch": 0.75802517961901, + "grad_norm": 0.01653144136071205, + "learning_rate": 0.00012426441483045256, + "loss": 0.0487, + "step": 2875 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.031312476843595505, + "learning_rate": 0.00012413247130228263, + "loss": 0.0155, + "step": 2880 + }, + { + "epoch": 0.7606617889394239, + "grad_norm": 0.011625733226537704, + "learning_rate": 0.0001240005277741127, + "loss": 0.0333, + "step": 2885 + }, + { + "epoch": 0.7619800935996309, + "grad_norm": 0.012089414522051811, + "learning_rate": 0.00012386858424594274, + "loss": 0.003, + "step": 2890 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.3012307584285736, + "learning_rate": 0.0001237366407177728, + "loss": 0.0172, + "step": 2895 + }, + { + "epoch": 0.7646167029200448, + "grad_norm": 0.31575000286102295, + "learning_rate": 0.00012360469718960285, + "loss": 0.0409, + "step": 2900 + }, + { + "epoch": 0.7659350075802518, + "grad_norm": 0.009794364683330059, + "learning_rate": 0.00012347275366143292, + "loss": 0.0214, + "step": 2905 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5973085165023804, + "learning_rate": 0.00012334081013326299, + "loss": 0.0245, + "step": 2910 + }, + { + "epoch": 0.7685716169006658, + "grad_norm": 0.019750040024518967, + "learning_rate": 0.00012320886660509303, + "loss": 0.0063, + "step": 2915 + }, + { + "epoch": 0.7698899215608728, + "grad_norm": 0.06402858346700668, + "learning_rate": 0.0001230769230769231, + "loss": 0.0444, + "step": 2920 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.02876671403646469, + "learning_rate": 0.00012294497954875314, + "loss": 0.0103, + "step": 2925 + }, + { + "epoch": 0.7725265308812866, + "grad_norm": 0.6962207555770874, + "learning_rate": 0.0001228130360205832, + "loss": 0.0318, + "step": 2930 + }, + { + "epoch": 0.7738448355414936, + "grad_norm": 0.006536522414535284, + "learning_rate": 0.00012268109249241325, + "loss": 0.0096, + "step": 2935 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.07097168266773224, + "learning_rate": 0.0001225491489642433, + "loss": 0.0174, + "step": 2940 + }, + { + "epoch": 0.7764814448619076, + "grad_norm": 0.042360126972198486, + "learning_rate": 0.00012241720543607336, + "loss": 0.0158, + "step": 2945 + }, + { + "epoch": 0.7777997495221146, + "grad_norm": 0.01159572321921587, + "learning_rate": 0.0001222852619079034, + "loss": 0.0265, + "step": 2950 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.38408163189888, + "learning_rate": 0.00012215331837973347, + "loss": 0.0233, + "step": 2955 + }, + { + "epoch": 0.7804363588425285, + "grad_norm": 0.15588605403900146, + "learning_rate": 0.00012202137485156353, + "loss": 0.0041, + "step": 2960 + }, + { + "epoch": 0.7817546635027355, + "grad_norm": 0.006892362609505653, + "learning_rate": 0.00012188943132339358, + "loss": 0.0026, + 
"step": 2965 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.030915727838873863, + "learning_rate": 0.00012175748779522364, + "loss": 0.0028, + "step": 2970 + }, + { + "epoch": 0.7843912728231495, + "grad_norm": 0.8151025772094727, + "learning_rate": 0.00012162554426705371, + "loss": 0.0429, + "step": 2975 + }, + { + "epoch": 0.7857095774833565, + "grad_norm": 0.6765475273132324, + "learning_rate": 0.00012149360073888377, + "loss": 0.0319, + "step": 2980 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.054469238966703415, + "learning_rate": 0.00012136165721071382, + "loss": 0.0413, + "step": 2985 + }, + { + "epoch": 0.7883461868037703, + "grad_norm": 0.045610666275024414, + "learning_rate": 0.00012122971368254388, + "loss": 0.0521, + "step": 2990 + }, + { + "epoch": 0.7896644914639773, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.00012109777015437393, + "loss": 0.0846, + "step": 2995 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.0272397268563509, + "learning_rate": 0.00012096582662620399, + "loss": 0.0364, + "step": 3000 + }, + { + "epoch": 0.7909827961241843, + "eval_loss": 0.033312585204839706, + "eval_runtime": 452.2552, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 3000 + }, + { + "epoch": 0.7923011007843913, + "grad_norm": 0.08674059063196182, + "learning_rate": 0.00012083388309803406, + "loss": 0.0081, + "step": 3005 + }, + { + "epoch": 0.7936194054445982, + "grad_norm": 0.21960832178592682, + "learning_rate": 0.00012070193956986411, + "loss": 0.0468, + "step": 3010 + }, + { + "epoch": 0.7949377101048052, + "grad_norm": 0.11259289085865021, + "learning_rate": 0.00012056999604169417, + "loss": 0.0124, + "step": 3015 + }, + { + "epoch": 0.7962560147650122, + "grad_norm": 0.02945362776517868, + "learning_rate": 0.00012043805251352422, + "loss": 0.0298, + "step": 3020 + }, + { + "epoch": 0.7975743194252192, + "grad_norm": 0.27889615297317505, + "learning_rate": 0.00012030610898535428, + "loss": 0.0251, + "step": 3025 + }, + { + "epoch": 0.7988926240854262, + "grad_norm": 0.05873241275548935, + "learning_rate": 0.00012017416545718434, + "loss": 0.0132, + "step": 3030 + }, + { + "epoch": 0.8002109287456332, + "grad_norm": 0.1570046991109848, + "learning_rate": 0.00012004222192901439, + "loss": 0.0228, + "step": 3035 + }, + { + "epoch": 0.80152923340584, + "grad_norm": 0.12575332820415497, + "learning_rate": 0.00011991027840084443, + "loss": 0.0049, + "step": 3040 + }, + { + "epoch": 0.802847538066047, + "grad_norm": 0.8416435122489929, + "learning_rate": 0.00011977833487267449, + "loss": 0.0542, + "step": 3045 + }, + { + "epoch": 0.804165842726254, + "grad_norm": 0.2605098485946655, + "learning_rate": 0.00011964639134450454, + "loss": 0.0084, + "step": 3050 + }, + { + "epoch": 0.805484147386461, + "grad_norm": 0.8996294736862183, + "learning_rate": 0.00011951444781633461, + "loss": 0.0442, + "step": 3055 + }, + { + "epoch": 0.806802452046668, + "grad_norm": 2.7525105476379395, + "learning_rate": 0.00011938250428816467, + "loss": 0.0642, + "step": 3060 + }, + { + "epoch": 0.808120756706875, + "grad_norm": 0.14955930411815643, + "learning_rate": 0.00011925056075999473, + "loss": 0.0384, + "step": 3065 + }, + { + "epoch": 0.8094390613670819, + "grad_norm": 0.018756115809082985, + "learning_rate": 0.00011911861723182478, + "loss": 0.0154, + "step": 3070 + }, + { + "epoch": 0.8107573660272889, + "grad_norm": 0.23998615145683289, + "learning_rate": 0.00011898667370365484, + "loss": 0.0413, + "step": 3075 + }, + { + "epoch": 
0.8120756706874959, + "grad_norm": 0.27253249287605286, + "learning_rate": 0.00011885473017548489, + "loss": 0.0081, + "step": 3080 + }, + { + "epoch": 0.8133939753477029, + "grad_norm": 0.2925993502140045, + "learning_rate": 0.00011872278664731495, + "loss": 0.0332, + "step": 3085 + }, + { + "epoch": 0.8147122800079099, + "grad_norm": 0.5364832878112793, + "learning_rate": 0.00011859084311914502, + "loss": 0.0143, + "step": 3090 + }, + { + "epoch": 0.8160305846681168, + "grad_norm": 0.32104921340942383, + "learning_rate": 0.00011845889959097507, + "loss": 0.0216, + "step": 3095 + }, + { + "epoch": 0.8173488893283237, + "grad_norm": 0.0205856766551733, + "learning_rate": 0.00011832695606280513, + "loss": 0.0346, + "step": 3100 + }, + { + "epoch": 0.8186671939885307, + "grad_norm": 0.2541547417640686, + "learning_rate": 0.00011819501253463518, + "loss": 0.0793, + "step": 3105 + }, + { + "epoch": 0.8199854986487377, + "grad_norm": 0.08333491533994675, + "learning_rate": 0.00011806306900646524, + "loss": 0.0049, + "step": 3110 + }, + { + "epoch": 0.8213038033089447, + "grad_norm": 0.0355968177318573, + "learning_rate": 0.0001179311254782953, + "loss": 0.0051, + "step": 3115 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.06948401033878326, + "learning_rate": 0.00011779918195012536, + "loss": 0.013, + "step": 3120 + }, + { + "epoch": 0.8239404126293587, + "grad_norm": 0.03328891843557358, + "learning_rate": 0.00011766723842195542, + "loss": 0.0122, + "step": 3125 + }, + { + "epoch": 0.8252587172895656, + "grad_norm": 0.013782350346446037, + "learning_rate": 0.00011753529489378548, + "loss": 0.0073, + "step": 3130 + }, + { + "epoch": 0.8265770219497726, + "grad_norm": 0.024390392005443573, + "learning_rate": 0.00011740335136561553, + "loss": 0.0143, + "step": 3135 + }, + { + "epoch": 0.8278953266099796, + "grad_norm": 0.002548128366470337, + "learning_rate": 0.00011727140783744557, + "loss": 0.0027, + "step": 3140 + }, + { + "epoch": 0.8292136312701865, + "grad_norm": 0.11674848943948746, + "learning_rate": 0.00011713946430927563, + "loss": 0.0253, + "step": 3145 + }, + { + "epoch": 0.8305319359303935, + "grad_norm": 0.005774884019047022, + "learning_rate": 0.00011700752078110568, + "loss": 0.0018, + "step": 3150 + }, + { + "epoch": 0.8318502405906005, + "grad_norm": 0.5763069987297058, + "learning_rate": 0.00011687557725293574, + "loss": 0.0119, + "step": 3155 + }, + { + "epoch": 0.8331685452508074, + "grad_norm": 0.0027607593219727278, + "learning_rate": 0.0001167436337247658, + "loss": 0.0279, + "step": 3160 + }, + { + "epoch": 0.8344868499110144, + "grad_norm": 1.859642505645752, + "learning_rate": 0.00011661169019659585, + "loss": 0.0228, + "step": 3165 + }, + { + "epoch": 0.8358051545712214, + "grad_norm": 0.16597022116184235, + "learning_rate": 0.00011647974666842592, + "loss": 0.1228, + "step": 3170 + }, + { + "epoch": 0.8371234592314284, + "grad_norm": 0.33833742141723633, + "learning_rate": 0.00011634780314025598, + "loss": 0.073, + "step": 3175 + }, + { + "epoch": 0.8384417638916354, + "grad_norm": 0.024682912975549698, + "learning_rate": 0.00011621585961208603, + "loss": 0.0042, + "step": 3180 + }, + { + "epoch": 0.8397600685518424, + "grad_norm": 0.05926942452788353, + "learning_rate": 0.00011608391608391609, + "loss": 0.0066, + "step": 3185 + }, + { + "epoch": 0.8410783732120493, + "grad_norm": 0.1414029747247696, + "learning_rate": 0.00011595197255574614, + "loss": 0.0603, + "step": 3190 + }, + { + "epoch": 0.8423966778722562, + "grad_norm": 0.37928736209869385, + 
"learning_rate": 0.0001158200290275762, + "loss": 0.0266, + "step": 3195 + }, + { + "epoch": 0.8437149825324632, + "grad_norm": 0.018329354003071785, + "learning_rate": 0.00011568808549940627, + "loss": 0.0047, + "step": 3200 + }, + { + "epoch": 0.8450332871926702, + "grad_norm": 0.2993735373020172, + "learning_rate": 0.00011555614197123632, + "loss": 0.0218, + "step": 3205 + }, + { + "epoch": 0.8463515918528772, + "grad_norm": 0.1767728328704834, + "learning_rate": 0.00011542419844306638, + "loss": 0.0363, + "step": 3210 + }, + { + "epoch": 0.8476698965130842, + "grad_norm": 0.39774414896965027, + "learning_rate": 0.00011529225491489644, + "loss": 0.0506, + "step": 3215 + }, + { + "epoch": 0.8489882011732911, + "grad_norm": 0.021896762773394585, + "learning_rate": 0.00011516031138672649, + "loss": 0.0081, + "step": 3220 + }, + { + "epoch": 0.8503065058334981, + "grad_norm": 0.358372300863266, + "learning_rate": 0.00011502836785855655, + "loss": 0.0224, + "step": 3225 + }, + { + "epoch": 0.8516248104937051, + "grad_norm": 0.01605542004108429, + "learning_rate": 0.00011489642433038662, + "loss": 0.0215, + "step": 3230 + }, + { + "epoch": 0.8529431151539121, + "grad_norm": 0.021189266815781593, + "learning_rate": 0.00011476448080221667, + "loss": 0.0051, + "step": 3235 + }, + { + "epoch": 0.8542614198141191, + "grad_norm": 0.013394076377153397, + "learning_rate": 0.0001146325372740467, + "loss": 0.021, + "step": 3240 + }, + { + "epoch": 0.8555797244743261, + "grad_norm": 0.19848507642745972, + "learning_rate": 0.00011450059374587676, + "loss": 0.0285, + "step": 3245 + }, + { + "epoch": 0.856898029134533, + "grad_norm": 0.2463046759366989, + "learning_rate": 0.00011436865021770683, + "loss": 0.0384, + "step": 3250 + }, + { + "epoch": 0.8582163337947399, + "grad_norm": 0.37432390451431274, + "learning_rate": 0.00011423670668953688, + "loss": 0.0098, + "step": 3255 + }, + { + "epoch": 0.8595346384549469, + "grad_norm": 0.060943394899368286, + "learning_rate": 0.00011410476316136694, + "loss": 0.0087, + "step": 3260 + }, + { + "epoch": 0.8608529431151539, + "grad_norm": 0.2846696674823761, + "learning_rate": 0.00011397281963319699, + "loss": 0.0148, + "step": 3265 + }, + { + "epoch": 0.8621712477753609, + "grad_norm": 0.009311323054134846, + "learning_rate": 0.00011384087610502705, + "loss": 0.0024, + "step": 3270 + }, + { + "epoch": 0.8634895524355679, + "grad_norm": 0.046277035027742386, + "learning_rate": 0.0001137089325768571, + "loss": 0.0274, + "step": 3275 + }, + { + "epoch": 0.8648078570957748, + "grad_norm": 0.006024620030075312, + "learning_rate": 0.00011357698904868716, + "loss": 0.0286, + "step": 3280 + }, + { + "epoch": 0.8661261617559818, + "grad_norm": 0.033578380942344666, + "learning_rate": 0.00011344504552051723, + "loss": 0.0153, + "step": 3285 + }, + { + "epoch": 0.8674444664161888, + "grad_norm": 0.8537917137145996, + "learning_rate": 0.00011331310199234728, + "loss": 0.0304, + "step": 3290 + }, + { + "epoch": 0.8687627710763958, + "grad_norm": 0.013933337293565273, + "learning_rate": 0.00011318115846417734, + "loss": 0.0112, + "step": 3295 + }, + { + "epoch": 0.8700810757366028, + "grad_norm": 0.35437721014022827, + "learning_rate": 0.0001130492149360074, + "loss": 0.0228, + "step": 3300 + }, + { + "epoch": 0.8713993803968098, + "grad_norm": 1.3024121522903442, + "learning_rate": 0.00011291727140783745, + "loss": 0.0203, + "step": 3305 + }, + { + "epoch": 0.8727176850570166, + "grad_norm": 0.5131255984306335, + "learning_rate": 0.00011278532787966751, + "loss": 0.0181, + 
"step": 3310 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.039366886019706726, + "learning_rate": 0.00011265338435149758, + "loss": 0.0192, + "step": 3315 + }, + { + "epoch": 0.8753542943774306, + "grad_norm": 0.13679669797420502, + "learning_rate": 0.00011252144082332763, + "loss": 0.004, + "step": 3320 + }, + { + "epoch": 0.8766725990376376, + "grad_norm": 0.003076886525377631, + "learning_rate": 0.00011238949729515769, + "loss": 0.0405, + "step": 3325 + }, + { + "epoch": 0.8779909036978446, + "grad_norm": 0.019953785464167595, + "learning_rate": 0.00011225755376698774, + "loss": 0.0241, + "step": 3330 + }, + { + "epoch": 0.8793092083580516, + "grad_norm": 0.007980377413332462, + "learning_rate": 0.0001121256102388178, + "loss": 0.0064, + "step": 3335 + }, + { + "epoch": 0.8806275130182585, + "grad_norm": 0.018761295825242996, + "learning_rate": 0.00011199366671064784, + "loss": 0.0032, + "step": 3340 + }, + { + "epoch": 0.8819458176784655, + "grad_norm": 0.022511709481477737, + "learning_rate": 0.0001118617231824779, + "loss": 0.0055, + "step": 3345 + }, + { + "epoch": 0.8832641223386725, + "grad_norm": 0.021270718425512314, + "learning_rate": 0.00011172977965430795, + "loss": 0.033, + "step": 3350 + }, + { + "epoch": 0.8845824269988795, + "grad_norm": 0.02710561640560627, + "learning_rate": 0.00011159783612613801, + "loss": 0.0094, + "step": 3355 + }, + { + "epoch": 0.8859007316590864, + "grad_norm": 0.4353378117084503, + "learning_rate": 0.00011146589259796806, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.8872190363192934, + "grad_norm": 0.0257766991853714, + "learning_rate": 0.00011133394906979813, + "loss": 0.0059, + "step": 3365 + }, + { + "epoch": 0.8885373409795003, + "grad_norm": 0.80838942527771, + "learning_rate": 0.00011120200554162819, + "loss": 0.0263, + "step": 3370 + }, + { + "epoch": 0.8898556456397073, + "grad_norm": 0.007799761835485697, + "learning_rate": 0.00011107006201345824, + "loss": 0.0028, + "step": 3375 + }, + { + "epoch": 0.8911739502999143, + "grad_norm": 0.007315775845199823, + "learning_rate": 0.0001109381184852883, + "loss": 0.0127, + "step": 3380 + }, + { + "epoch": 0.8924922549601213, + "grad_norm": 1.4861233234405518, + "learning_rate": 0.00011080617495711836, + "loss": 0.0562, + "step": 3385 + }, + { + "epoch": 0.8938105596203283, + "grad_norm": 0.010219530202448368, + "learning_rate": 0.00011067423142894841, + "loss": 0.0438, + "step": 3390 + }, + { + "epoch": 0.8951288642805353, + "grad_norm": 1.0191857814788818, + "learning_rate": 0.00011054228790077848, + "loss": 0.0493, + "step": 3395 + }, + { + "epoch": 0.8964471689407422, + "grad_norm": 0.01459536887705326, + "learning_rate": 0.00011041034437260854, + "loss": 0.0117, + "step": 3400 + }, + { + "epoch": 0.8977654736009492, + "grad_norm": 0.008682495914399624, + "learning_rate": 0.00011027840084443859, + "loss": 0.02, + "step": 3405 + }, + { + "epoch": 0.8990837782611562, + "grad_norm": 0.02197263017296791, + "learning_rate": 0.00011014645731626865, + "loss": 0.0454, + "step": 3410 + }, + { + "epoch": 0.9004020829213631, + "grad_norm": 0.01436714269220829, + "learning_rate": 0.0001100145137880987, + "loss": 0.0283, + "step": 3415 + }, + { + "epoch": 0.9017203875815701, + "grad_norm": 0.14327946305274963, + "learning_rate": 0.00010988257025992876, + "loss": 0.0461, + "step": 3420 + }, + { + "epoch": 0.9030386922417771, + "grad_norm": 1.671773910522461, + "learning_rate": 0.00010975062673175883, + "loss": 0.054, + "step": 3425 + }, + { + "epoch": 0.904356996901984, + 
"grad_norm": 0.009926804341375828, + "learning_rate": 0.00010961868320358888, + "loss": 0.0429, + "step": 3430 + }, + { + "epoch": 0.905675301562191, + "grad_norm": 0.554020881652832, + "learning_rate": 0.00010948673967541894, + "loss": 0.0618, + "step": 3435 + }, + { + "epoch": 0.906993606222398, + "grad_norm": 0.1399248093366623, + "learning_rate": 0.00010935479614724897, + "loss": 0.0229, + "step": 3440 + }, + { + "epoch": 0.908311910882605, + "grad_norm": 0.02739197015762329, + "learning_rate": 0.00010922285261907904, + "loss": 0.0082, + "step": 3445 + }, + { + "epoch": 0.909630215542812, + "grad_norm": 0.33394527435302734, + "learning_rate": 0.00010909090909090909, + "loss": 0.0403, + "step": 3450 + }, + { + "epoch": 0.9109485202030189, + "grad_norm": 0.08083894103765488, + "learning_rate": 0.00010895896556273915, + "loss": 0.0406, + "step": 3455 + }, + { + "epoch": 0.9122668248632259, + "grad_norm": 0.39336663484573364, + "learning_rate": 0.0001088270220345692, + "loss": 0.02, + "step": 3460 + }, + { + "epoch": 0.9135851295234328, + "grad_norm": 0.20481553673744202, + "learning_rate": 0.00010869507850639926, + "loss": 0.0221, + "step": 3465 + }, + { + "epoch": 0.9149034341836398, + "grad_norm": 1.4507408142089844, + "learning_rate": 0.00010856313497822932, + "loss": 0.0357, + "step": 3470 + }, + { + "epoch": 0.9162217388438468, + "grad_norm": 0.2678806483745575, + "learning_rate": 0.00010843119145005937, + "loss": 0.0181, + "step": 3475 + }, + { + "epoch": 0.9175400435040538, + "grad_norm": 0.007361674215644598, + "learning_rate": 0.00010829924792188944, + "loss": 0.0978, + "step": 3480 + }, + { + "epoch": 0.9188583481642607, + "grad_norm": 0.773695707321167, + "learning_rate": 0.0001081673043937195, + "loss": 0.0401, + "step": 3485 + }, + { + "epoch": 0.9201766528244677, + "grad_norm": 0.0010772625682875514, + "learning_rate": 0.00010803536086554955, + "loss": 0.0233, + "step": 3490 + }, + { + "epoch": 0.9214949574846747, + "grad_norm": 0.08971104770898819, + "learning_rate": 0.00010790341733737961, + "loss": 0.0319, + "step": 3495 + }, + { + "epoch": 0.9228132621448817, + "grad_norm": 0.21372731029987335, + "learning_rate": 0.00010777147380920966, + "loss": 0.0315, + "step": 3500 + }, + { + "epoch": 0.9228132621448817, + "eval_loss": 0.02952708676457405, + "eval_runtime": 451.5837, + "eval_samples_per_second": 7.467, + "eval_steps_per_second": 3.734, + "step": 3500 + }, + { + "epoch": 0.9241315668050887, + "grad_norm": 0.016639264300465584, + "learning_rate": 0.00010763953028103972, + "loss": 0.0125, + "step": 3505 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.46340492367744446, + "learning_rate": 0.00010750758675286979, + "loss": 0.0186, + "step": 3510 + }, + { + "epoch": 0.9267681761255026, + "grad_norm": 0.01847526989877224, + "learning_rate": 0.00010737564322469984, + "loss": 0.0026, + "step": 3515 + }, + { + "epoch": 0.9280864807857095, + "grad_norm": 0.5947860479354858, + "learning_rate": 0.0001072436996965299, + "loss": 0.0259, + "step": 3520 + }, + { + "epoch": 0.9294047854459165, + "grad_norm": 0.06145291402935982, + "learning_rate": 0.00010711175616835995, + "loss": 0.0057, + "step": 3525 + }, + { + "epoch": 0.9307230901061235, + "grad_norm": 0.0143959429115057, + "learning_rate": 0.00010697981264019001, + "loss": 0.0145, + "step": 3530 + }, + { + "epoch": 0.9320413947663305, + "grad_norm": 0.21143831312656403, + "learning_rate": 0.00010684786911202007, + "loss": 0.0459, + "step": 3535 + }, + { + "epoch": 0.9333596994265375, + "grad_norm": 0.02548077143728733, 
+ "learning_rate": 0.00010671592558385011, + "loss": 0.0051, + "step": 3540 + }, + { + "epoch": 0.9346780040867444, + "grad_norm": 0.008077048696577549, + "learning_rate": 0.00010658398205568016, + "loss": 0.0306, + "step": 3545 + }, + { + "epoch": 0.9359963087469514, + "grad_norm": 0.0030760422814637423, + "learning_rate": 0.00010645203852751022, + "loss": 0.0575, + "step": 3550 + }, + { + "epoch": 0.9373146134071584, + "grad_norm": 0.18114158511161804, + "learning_rate": 0.00010632009499934027, + "loss": 0.0885, + "step": 3555 + }, + { + "epoch": 0.9386329180673654, + "grad_norm": 0.02450549602508545, + "learning_rate": 0.00010618815147117034, + "loss": 0.0045, + "step": 3560 + }, + { + "epoch": 0.9399512227275724, + "grad_norm": 0.1238626018166542, + "learning_rate": 0.0001060562079430004, + "loss": 0.0166, + "step": 3565 + }, + { + "epoch": 0.9412695273877794, + "grad_norm": 0.1879919469356537, + "learning_rate": 0.00010592426441483046, + "loss": 0.0077, + "step": 3570 + }, + { + "epoch": 0.9425878320479862, + "grad_norm": 0.11323565989732742, + "learning_rate": 0.00010579232088666051, + "loss": 0.0213, + "step": 3575 + }, + { + "epoch": 0.9439061367081932, + "grad_norm": 0.35575854778289795, + "learning_rate": 0.00010566037735849057, + "loss": 0.0336, + "step": 3580 + }, + { + "epoch": 0.9452244413684002, + "grad_norm": 0.14052227139472961, + "learning_rate": 0.00010552843383032062, + "loss": 0.0325, + "step": 3585 + }, + { + "epoch": 0.9465427460286072, + "grad_norm": 0.2643798887729645, + "learning_rate": 0.00010539649030215069, + "loss": 0.0192, + "step": 3590 + }, + { + "epoch": 0.9478610506888142, + "grad_norm": 0.3207031190395355, + "learning_rate": 0.00010526454677398075, + "loss": 0.0221, + "step": 3595 + }, + { + "epoch": 0.9491793553490212, + "grad_norm": 0.022803861647844315, + "learning_rate": 0.0001051326032458108, + "loss": 0.029, + "step": 3600 + }, + { + "epoch": 0.9504976600092281, + "grad_norm": 0.02511664852499962, + "learning_rate": 0.00010500065971764086, + "loss": 0.0422, + "step": 3605 + }, + { + "epoch": 0.9518159646694351, + "grad_norm": 0.06505445390939713, + "learning_rate": 0.00010486871618947091, + "loss": 0.0092, + "step": 3610 + }, + { + "epoch": 0.9531342693296421, + "grad_norm": 0.09998584538698196, + "learning_rate": 0.00010473677266130097, + "loss": 0.0242, + "step": 3615 + }, + { + "epoch": 0.9544525739898491, + "grad_norm": 0.9645698666572571, + "learning_rate": 0.00010460482913313104, + "loss": 0.0124, + "step": 3620 + }, + { + "epoch": 0.955770878650056, + "grad_norm": 0.2389964610338211, + "learning_rate": 0.0001044728856049611, + "loss": 0.0169, + "step": 3625 + }, + { + "epoch": 0.957089183310263, + "grad_norm": 2.030608654022217, + "learning_rate": 0.00010434094207679115, + "loss": 0.0518, + "step": 3630 + }, + { + "epoch": 0.9584074879704699, + "grad_norm": 0.05979987606406212, + "learning_rate": 0.0001042089985486212, + "loss": 0.0081, + "step": 3635 + }, + { + "epoch": 0.9597257926306769, + "grad_norm": 0.15761719644069672, + "learning_rate": 0.00010407705502045125, + "loss": 0.0061, + "step": 3640 + }, + { + "epoch": 0.9610440972908839, + "grad_norm": 0.6534290909767151, + "learning_rate": 0.0001039451114922813, + "loss": 0.0104, + "step": 3645 + }, + { + "epoch": 0.9623624019510909, + "grad_norm": 1.0324147939682007, + "learning_rate": 0.00010381316796411136, + "loss": 0.0381, + "step": 3650 + }, + { + "epoch": 0.9636807066112979, + "grad_norm": 0.002968872431665659, + "learning_rate": 0.00010368122443594142, + "loss": 0.0343, + "step": 
3655 + }, + { + "epoch": 0.9649990112715049, + "grad_norm": 0.011243184097111225, + "learning_rate": 0.00010354928090777147, + "loss": 0.019, + "step": 3660 + }, + { + "epoch": 0.9663173159317118, + "grad_norm": 0.17663739621639252, + "learning_rate": 0.00010341733737960153, + "loss": 0.0452, + "step": 3665 + }, + { + "epoch": 0.9676356205919188, + "grad_norm": 1.2647719383239746, + "learning_rate": 0.00010328539385143158, + "loss": 0.0154, + "step": 3670 + }, + { + "epoch": 0.9689539252521258, + "grad_norm": 0.3691752552986145, + "learning_rate": 0.00010315345032326165, + "loss": 0.028, + "step": 3675 + }, + { + "epoch": 0.9702722299123328, + "grad_norm": 0.0015879774000495672, + "learning_rate": 0.00010302150679509171, + "loss": 0.0202, + "step": 3680 + }, + { + "epoch": 0.9715905345725397, + "grad_norm": 0.1441984623670578, + "learning_rate": 0.00010288956326692176, + "loss": 0.0221, + "step": 3685 + }, + { + "epoch": 0.9729088392327467, + "grad_norm": 0.20431455969810486, + "learning_rate": 0.00010275761973875182, + "loss": 0.0072, + "step": 3690 + }, + { + "epoch": 0.9742271438929536, + "grad_norm": 0.861625611782074, + "learning_rate": 0.00010262567621058187, + "loss": 0.0523, + "step": 3695 + }, + { + "epoch": 0.9755454485531606, + "grad_norm": 0.005049478262662888, + "learning_rate": 0.00010249373268241193, + "loss": 0.0051, + "step": 3700 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.49685510993003845, + "learning_rate": 0.000102361789154242, + "loss": 0.023, + "step": 3705 + }, + { + "epoch": 0.9781820578735746, + "grad_norm": 0.08789395540952682, + "learning_rate": 0.00010222984562607205, + "loss": 0.0159, + "step": 3710 + }, + { + "epoch": 0.9795003625337816, + "grad_norm": 0.027168691158294678, + "learning_rate": 0.00010209790209790211, + "loss": 0.0083, + "step": 3715 + }, + { + "epoch": 0.9808186671939886, + "grad_norm": 0.0006773864733986557, + "learning_rate": 0.00010196595856973217, + "loss": 0.0048, + "step": 3720 + }, + { + "epoch": 0.9821369718541955, + "grad_norm": 0.01636457070708275, + "learning_rate": 0.00010183401504156222, + "loss": 0.0159, + "step": 3725 + }, + { + "epoch": 0.9834552765144025, + "grad_norm": 0.10160859674215317, + "learning_rate": 0.00010170207151339228, + "loss": 0.0047, + "step": 3730 + }, + { + "epoch": 0.9847735811746094, + "grad_norm": 0.14173269271850586, + "learning_rate": 0.00010157012798522232, + "loss": 0.006, + "step": 3735 + }, + { + "epoch": 0.9860918858348164, + "grad_norm": 0.003458512481302023, + "learning_rate": 0.00010143818445705238, + "loss": 0.0193, + "step": 3740 + }, + { + "epoch": 0.9874101904950234, + "grad_norm": 0.005163820460438728, + "learning_rate": 0.00010130624092888243, + "loss": 0.0039, + "step": 3745 + }, + { + "epoch": 0.9887284951552304, + "grad_norm": 0.005913791712373495, + "learning_rate": 0.00010117429740071249, + "loss": 0.0119, + "step": 3750 + }, + { + "epoch": 0.9900467998154373, + "grad_norm": 0.00800853967666626, + "learning_rate": 0.00010104235387254256, + "loss": 0.044, + "step": 3755 + }, + { + "epoch": 0.9913651044756443, + "grad_norm": 0.18146778643131256, + "learning_rate": 0.00010091041034437261, + "loss": 0.0048, + "step": 3760 + }, + { + "epoch": 0.9926834091358513, + "grad_norm": 0.01235104724764824, + "learning_rate": 0.00010077846681620267, + "loss": 0.0017, + "step": 3765 + }, + { + "epoch": 0.9940017137960583, + "grad_norm": 0.17677897214889526, + "learning_rate": 0.00010064652328803272, + "loss": 0.0339, + "step": 3770 + }, + { + "epoch": 0.9953200184562653, + "grad_norm": 
0.0017472271574661136, + "learning_rate": 0.00010051457975986278, + "loss": 0.0494, + "step": 3775 + }, + { + "epoch": 0.9966383231164723, + "grad_norm": 0.10814860463142395, + "learning_rate": 0.00010038263623169283, + "loss": 0.0741, + "step": 3780 + }, + { + "epoch": 0.9979566277766792, + "grad_norm": 0.11329760402441025, + "learning_rate": 0.0001002506927035229, + "loss": 0.0182, + "step": 3785 + }, + { + "epoch": 0.9992749324368861, + "grad_norm": 0.11573276668787003, + "learning_rate": 0.00010011874917535296, + "loss": 0.0068, + "step": 3790 + }, + { + "epoch": 1.000790982796124, + "grad_norm": 0.08449886739253998, + "learning_rate": 9.998680564718301e-05, + "loss": 0.0141, + "step": 3795 + }, + { + "epoch": 1.002109287456331, + "grad_norm": 0.05035184696316719, + "learning_rate": 9.985486211901307e-05, + "loss": 0.0293, + "step": 3800 + }, + { + "epoch": 1.003427592116538, + "grad_norm": 0.0255444198846817, + "learning_rate": 9.972291859084313e-05, + "loss": 0.0054, + "step": 3805 + }, + { + "epoch": 1.004745896776745, + "grad_norm": 0.0033677336759865284, + "learning_rate": 9.959097506267318e-05, + "loss": 0.0567, + "step": 3810 + }, + { + "epoch": 1.006064201436952, + "grad_norm": 0.09453682601451874, + "learning_rate": 9.945903153450324e-05, + "loss": 0.0589, + "step": 3815 + }, + { + "epoch": 1.007382506097159, + "grad_norm": 0.01592979207634926, + "learning_rate": 9.932708800633329e-05, + "loss": 0.0043, + "step": 3820 + }, + { + "epoch": 1.008700810757366, + "grad_norm": 0.002263693604618311, + "learning_rate": 9.919514447816335e-05, + "loss": 0.0195, + "step": 3825 + }, + { + "epoch": 1.010019115417573, + "grad_norm": 0.013390793465077877, + "learning_rate": 9.90632009499934e-05, + "loss": 0.0152, + "step": 3830 + }, + { + "epoch": 1.01133742007778, + "grad_norm": 0.10473847389221191, + "learning_rate": 9.893125742182346e-05, + "loss": 0.0606, + "step": 3835 + }, + { + "epoch": 1.012655724737987, + "grad_norm": 0.05837221071124077, + "learning_rate": 9.879931389365353e-05, + "loss": 0.0121, + "step": 3840 + }, + { + "epoch": 1.013974029398194, + "grad_norm": 0.3803791105747223, + "learning_rate": 9.866737036548358e-05, + "loss": 0.0386, + "step": 3845 + }, + { + "epoch": 1.0152923340584008, + "grad_norm": 0.4067519009113312, + "learning_rate": 9.853542683731364e-05, + "loss": 0.0115, + "step": 3850 + }, + { + "epoch": 1.0166106387186078, + "grad_norm": 0.02585229091346264, + "learning_rate": 9.84034833091437e-05, + "loss": 0.0214, + "step": 3855 + }, + { + "epoch": 1.0179289433788148, + "grad_norm": 0.03670825809240341, + "learning_rate": 9.827153978097374e-05, + "loss": 0.0059, + "step": 3860 + }, + { + "epoch": 1.0192472480390218, + "grad_norm": 0.014171554706990719, + "learning_rate": 9.81395962528038e-05, + "loss": 0.0145, + "step": 3865 + }, + { + "epoch": 1.0205655526992288, + "grad_norm": 0.027376385405659676, + "learning_rate": 9.800765272463386e-05, + "loss": 0.0089, + "step": 3870 + }, + { + "epoch": 1.0218838573594358, + "grad_norm": 0.03168405964970589, + "learning_rate": 9.787570919646392e-05, + "loss": 0.0132, + "step": 3875 + }, + { + "epoch": 1.0232021620196428, + "grad_norm": 0.03346199914813042, + "learning_rate": 9.774376566829397e-05, + "loss": 0.0246, + "step": 3880 + }, + { + "epoch": 1.0245204666798498, + "grad_norm": 0.00894144270569086, + "learning_rate": 9.761182214012403e-05, + "loss": 0.0105, + "step": 3885 + }, + { + "epoch": 1.0258387713400567, + "grad_norm": 0.3172806203365326, + "learning_rate": 9.747987861195409e-05, + "loss": 0.0103, + 
"step": 3890 + }, + { + "epoch": 1.0271570760002637, + "grad_norm": 0.009055040776729584, + "learning_rate": 9.734793508378414e-05, + "loss": 0.0103, + "step": 3895 + }, + { + "epoch": 1.0284753806604707, + "grad_norm": 0.014140011742711067, + "learning_rate": 9.721599155561421e-05, + "loss": 0.0037, + "step": 3900 + }, + { + "epoch": 1.0297936853206777, + "grad_norm": 0.008317383006215096, + "learning_rate": 9.708404802744427e-05, + "loss": 0.002, + "step": 3905 + }, + { + "epoch": 1.0311119899808845, + "grad_norm": 0.005038558971136808, + "learning_rate": 9.695210449927431e-05, + "loss": 0.0017, + "step": 3910 + }, + { + "epoch": 1.0324302946410915, + "grad_norm": 0.40058520436286926, + "learning_rate": 9.682016097110436e-05, + "loss": 0.0065, + "step": 3915 + }, + { + "epoch": 1.0337485993012985, + "grad_norm": 0.005197151098400354, + "learning_rate": 9.668821744293442e-05, + "loss": 0.0031, + "step": 3920 + }, + { + "epoch": 1.0350669039615055, + "grad_norm": 0.014353781007230282, + "learning_rate": 9.655627391476449e-05, + "loss": 0.0009, + "step": 3925 + }, + { + "epoch": 1.0363852086217125, + "grad_norm": 0.13260559737682343, + "learning_rate": 9.642433038659454e-05, + "loss": 0.0323, + "step": 3930 + }, + { + "epoch": 1.0377035132819195, + "grad_norm": 0.006795065477490425, + "learning_rate": 9.62923868584246e-05, + "loss": 0.0022, + "step": 3935 + }, + { + "epoch": 1.0390218179421264, + "grad_norm": 0.2276086062192917, + "learning_rate": 9.616044333025466e-05, + "loss": 0.0221, + "step": 3940 + }, + { + "epoch": 1.0403401226023334, + "grad_norm": 0.06121920794248581, + "learning_rate": 9.602849980208471e-05, + "loss": 0.0037, + "step": 3945 + }, + { + "epoch": 1.0416584272625404, + "grad_norm": 0.9180755019187927, + "learning_rate": 9.589655627391477e-05, + "loss": 0.0589, + "step": 3950 + }, + { + "epoch": 1.0429767319227474, + "grad_norm": 0.07515591382980347, + "learning_rate": 9.576461274574484e-05, + "loss": 0.0653, + "step": 3955 + }, + { + "epoch": 1.0442950365829544, + "grad_norm": 0.018060607835650444, + "learning_rate": 9.563266921757488e-05, + "loss": 0.0178, + "step": 3960 + }, + { + "epoch": 1.0456133412431612, + "grad_norm": 0.02751368284225464, + "learning_rate": 9.550072568940493e-05, + "loss": 0.0076, + "step": 3965 + }, + { + "epoch": 1.0469316459033682, + "grad_norm": 0.653998613357544, + "learning_rate": 9.536878216123499e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 1.0482499505635752, + "grad_norm": 0.3117768168449402, + "learning_rate": 9.523683863306505e-05, + "loss": 0.0087, + "step": 3975 + }, + { + "epoch": 1.0495682552237822, + "grad_norm": 0.013952831737697124, + "learning_rate": 9.510489510489511e-05, + "loss": 0.0037, + "step": 3980 + }, + { + "epoch": 1.0508865598839892, + "grad_norm": 0.01806250400841236, + "learning_rate": 9.497295157672517e-05, + "loss": 0.0028, + "step": 3985 + }, + { + "epoch": 1.0522048645441962, + "grad_norm": 0.13678006827831268, + "learning_rate": 9.484100804855523e-05, + "loss": 0.0533, + "step": 3990 + }, + { + "epoch": 1.0535231692044031, + "grad_norm": 0.14869382977485657, + "learning_rate": 9.470906452038528e-05, + "loss": 0.009, + "step": 3995 + }, + { + "epoch": 1.0548414738646101, + "grad_norm": 0.33614659309387207, + "learning_rate": 9.457712099221534e-05, + "loss": 0.0555, + "step": 4000 + }, + { + "epoch": 1.0548414738646101, + "eval_loss": 0.026165226474404335, + "eval_runtime": 452.2482, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 4000 + }, + { + "epoch": 
1.0561597785248171, + "grad_norm": 0.007546027656644583, + "learning_rate": 9.444517746404539e-05, + "loss": 0.0029, + "step": 4005 + }, + { + "epoch": 1.0574780831850241, + "grad_norm": 0.3720332384109497, + "learning_rate": 9.431323393587545e-05, + "loss": 0.0353, + "step": 4010 + }, + { + "epoch": 1.0587963878452311, + "grad_norm": 1.1335264444351196, + "learning_rate": 9.41812904077055e-05, + "loss": 0.0142, + "step": 4015 + }, + { + "epoch": 1.060114692505438, + "grad_norm": 0.024723488837480545, + "learning_rate": 9.404934687953556e-05, + "loss": 0.006, + "step": 4020 + }, + { + "epoch": 1.0614329971656449, + "grad_norm": 0.040354058146476746, + "learning_rate": 9.391740335136562e-05, + "loss": 0.0107, + "step": 4025 + }, + { + "epoch": 1.0627513018258519, + "grad_norm": 0.222810298204422, + "learning_rate": 9.378545982319567e-05, + "loss": 0.0273, + "step": 4030 + }, + { + "epoch": 1.0640696064860589, + "grad_norm": 0.025684095919132233, + "learning_rate": 9.365351629502574e-05, + "loss": 0.0033, + "step": 4035 + }, + { + "epoch": 1.0653879111462659, + "grad_norm": 0.05338352546095848, + "learning_rate": 9.35215727668558e-05, + "loss": 0.0052, + "step": 4040 + }, + { + "epoch": 1.0667062158064728, + "grad_norm": 0.06182330474257469, + "learning_rate": 9.338962923868585e-05, + "loss": 0.0038, + "step": 4045 + }, + { + "epoch": 1.0680245204666798, + "grad_norm": 0.012170832604169846, + "learning_rate": 9.325768571051591e-05, + "loss": 0.0018, + "step": 4050 + }, + { + "epoch": 1.0693428251268868, + "grad_norm": 0.5424306392669678, + "learning_rate": 9.312574218234596e-05, + "loss": 0.0445, + "step": 4055 + }, + { + "epoch": 1.0706611297870938, + "grad_norm": 0.017939254641532898, + "learning_rate": 9.299379865417602e-05, + "loss": 0.0389, + "step": 4060 + }, + { + "epoch": 1.0719794344473008, + "grad_norm": 0.0060431682504713535, + "learning_rate": 9.286185512600607e-05, + "loss": 0.0025, + "step": 4065 + }, + { + "epoch": 1.0732977391075078, + "grad_norm": 0.0071444883942604065, + "learning_rate": 9.272991159783613e-05, + "loss": 0.0333, + "step": 4070 + }, + { + "epoch": 1.0746160437677148, + "grad_norm": 0.29632750153541565, + "learning_rate": 9.259796806966619e-05, + "loss": 0.0151, + "step": 4075 + }, + { + "epoch": 1.0759343484279218, + "grad_norm": 0.004526323173195124, + "learning_rate": 9.246602454149624e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 1.0772526530881286, + "grad_norm": 0.023945212364196777, + "learning_rate": 9.23340810133263e-05, + "loss": 0.004, + "step": 4085 + }, + { + "epoch": 1.0785709577483356, + "grad_norm": 0.13235126435756683, + "learning_rate": 9.220213748515635e-05, + "loss": 0.0059, + "step": 4090 + }, + { + "epoch": 1.0798892624085425, + "grad_norm": 0.17592330276966095, + "learning_rate": 9.207019395698642e-05, + "loss": 0.0302, + "step": 4095 + }, + { + "epoch": 1.0812075670687495, + "grad_norm": 0.004582866560667753, + "learning_rate": 9.193825042881648e-05, + "loss": 0.009, + "step": 4100 + }, + { + "epoch": 1.0825258717289565, + "grad_norm": 0.15214525163173676, + "learning_rate": 9.180630690064653e-05, + "loss": 0.0062, + "step": 4105 + }, + { + "epoch": 1.0838441763891635, + "grad_norm": 0.16535983979701996, + "learning_rate": 9.167436337247658e-05, + "loss": 0.0926, + "step": 4110 + }, + { + "epoch": 1.0851624810493705, + "grad_norm": 0.013285227119922638, + "learning_rate": 9.154241984430663e-05, + "loss": 0.0043, + "step": 4115 + }, + { + "epoch": 1.0864807857095775, + "grad_norm": 0.012116984464228153, + "learning_rate": 
9.14104763161367e-05, + "loss": 0.0037, + "step": 4120 + }, + { + "epoch": 1.0877990903697845, + "grad_norm": 0.0373845212161541, + "learning_rate": 9.127853278796676e-05, + "loss": 0.0081, + "step": 4125 + }, + { + "epoch": 1.0891173950299915, + "grad_norm": 0.09324615448713303, + "learning_rate": 9.114658925979681e-05, + "loss": 0.0534, + "step": 4130 + }, + { + "epoch": 1.0904356996901985, + "grad_norm": 0.010992968454957008, + "learning_rate": 9.101464573162687e-05, + "loss": 0.0025, + "step": 4135 + }, + { + "epoch": 1.0917540043504055, + "grad_norm": 0.13710318505764008, + "learning_rate": 9.088270220345692e-05, + "loss": 0.0555, + "step": 4140 + }, + { + "epoch": 1.0930723090106123, + "grad_norm": 0.010403074324131012, + "learning_rate": 9.075075867528698e-05, + "loss": 0.0042, + "step": 4145 + }, + { + "epoch": 1.0943906136708192, + "grad_norm": 0.21544460952281952, + "learning_rate": 9.061881514711705e-05, + "loss": 0.0144, + "step": 4150 + }, + { + "epoch": 1.0957089183310262, + "grad_norm": 0.04194799065589905, + "learning_rate": 9.04868716189471e-05, + "loss": 0.0106, + "step": 4155 + }, + { + "epoch": 1.0970272229912332, + "grad_norm": 0.029204202815890312, + "learning_rate": 9.035492809077715e-05, + "loss": 0.0085, + "step": 4160 + }, + { + "epoch": 1.0983455276514402, + "grad_norm": 0.006751026958227158, + "learning_rate": 9.02229845626072e-05, + "loss": 0.0049, + "step": 4165 + }, + { + "epoch": 1.0996638323116472, + "grad_norm": 0.008232722990214825, + "learning_rate": 9.009104103443726e-05, + "loss": 0.0172, + "step": 4170 + }, + { + "epoch": 1.1009821369718542, + "grad_norm": 0.05630079656839371, + "learning_rate": 8.995909750626733e-05, + "loss": 0.0112, + "step": 4175 + }, + { + "epoch": 1.1023004416320612, + "grad_norm": 0.0011601662263274193, + "learning_rate": 8.982715397809738e-05, + "loss": 0.0317, + "step": 4180 + }, + { + "epoch": 1.1036187462922682, + "grad_norm": 0.006554402410984039, + "learning_rate": 8.969521044992744e-05, + "loss": 0.0035, + "step": 4185 + }, + { + "epoch": 1.1049370509524752, + "grad_norm": 0.34513652324676514, + "learning_rate": 8.956326692175749e-05, + "loss": 0.0036, + "step": 4190 + }, + { + "epoch": 1.1062553556126822, + "grad_norm": 0.283669650554657, + "learning_rate": 8.943132339358755e-05, + "loss": 0.0182, + "step": 4195 + }, + { + "epoch": 1.1075736602728892, + "grad_norm": 0.5376952290534973, + "learning_rate": 8.92993798654176e-05, + "loss": 0.0293, + "step": 4200 + }, + { + "epoch": 1.108891964933096, + "grad_norm": 0.01689724065363407, + "learning_rate": 8.916743633724767e-05, + "loss": 0.0206, + "step": 4205 + }, + { + "epoch": 1.110210269593303, + "grad_norm": 0.026538770645856857, + "learning_rate": 8.903549280907772e-05, + "loss": 0.0181, + "step": 4210 + }, + { + "epoch": 1.11152857425351, + "grad_norm": 0.6372873783111572, + "learning_rate": 8.890354928090777e-05, + "loss": 0.021, + "step": 4215 + }, + { + "epoch": 1.112846878913717, + "grad_norm": 0.06177428737282753, + "learning_rate": 8.877160575273783e-05, + "loss": 0.0033, + "step": 4220 + }, + { + "epoch": 1.114165183573924, + "grad_norm": 0.3712109923362732, + "learning_rate": 8.863966222456788e-05, + "loss": 0.0075, + "step": 4225 + }, + { + "epoch": 1.115483488234131, + "grad_norm": 0.030514653772115707, + "learning_rate": 8.850771869639795e-05, + "loss": 0.0183, + "step": 4230 + }, + { + "epoch": 1.116801792894338, + "grad_norm": 0.012861707247793674, + "learning_rate": 8.837577516822801e-05, + "loss": 0.0032, + "step": 4235 + }, + { + "epoch": 
1.118120097554545, + "grad_norm": 0.3278522789478302, + "learning_rate": 8.824383164005806e-05, + "loss": 0.0058, + "step": 4240 + }, + { + "epoch": 1.1194384022147519, + "grad_norm": 0.580259382724762, + "learning_rate": 8.811188811188812e-05, + "loss": 0.0068, + "step": 4245 + }, + { + "epoch": 1.1207567068749589, + "grad_norm": 0.007002575788646936, + "learning_rate": 8.797994458371817e-05, + "loss": 0.0063, + "step": 4250 + }, + { + "epoch": 1.1220750115351659, + "grad_norm": 0.22484643757343292, + "learning_rate": 8.784800105554823e-05, + "loss": 0.0167, + "step": 4255 + }, + { + "epoch": 1.1233933161953726, + "grad_norm": 0.004122686106711626, + "learning_rate": 8.771605752737829e-05, + "loss": 0.002, + "step": 4260 + }, + { + "epoch": 1.1247116208555796, + "grad_norm": 0.009832561016082764, + "learning_rate": 8.758411399920834e-05, + "loss": 0.0029, + "step": 4265 + }, + { + "epoch": 1.1260299255157866, + "grad_norm": 0.04854527860879898, + "learning_rate": 8.74521704710384e-05, + "loss": 0.0068, + "step": 4270 + }, + { + "epoch": 1.1273482301759936, + "grad_norm": 0.12221235036849976, + "learning_rate": 8.732022694286845e-05, + "loss": 0.003, + "step": 4275 + }, + { + "epoch": 1.1286665348362006, + "grad_norm": 0.005857539363205433, + "learning_rate": 8.718828341469851e-05, + "loss": 0.0022, + "step": 4280 + }, + { + "epoch": 1.1299848394964076, + "grad_norm": 0.10582758486270905, + "learning_rate": 8.705633988652856e-05, + "loss": 0.002, + "step": 4285 + }, + { + "epoch": 1.1313031441566146, + "grad_norm": 0.006190940272063017, + "learning_rate": 8.692439635835863e-05, + "loss": 0.0022, + "step": 4290 + }, + { + "epoch": 1.1326214488168216, + "grad_norm": 0.00221514655277133, + "learning_rate": 8.679245283018869e-05, + "loss": 0.0314, + "step": 4295 + }, + { + "epoch": 1.1339397534770286, + "grad_norm": 0.0796755850315094, + "learning_rate": 8.666050930201874e-05, + "loss": 0.0347, + "step": 4300 + }, + { + "epoch": 1.1352580581372356, + "grad_norm": 0.20088806748390198, + "learning_rate": 8.65285657738488e-05, + "loss": 0.0048, + "step": 4305 + }, + { + "epoch": 1.1365763627974426, + "grad_norm": 0.4018377363681793, + "learning_rate": 8.639662224567884e-05, + "loss": 0.0234, + "step": 4310 + }, + { + "epoch": 1.1378946674576496, + "grad_norm": 0.014961684122681618, + "learning_rate": 8.626467871750891e-05, + "loss": 0.0033, + "step": 4315 + }, + { + "epoch": 1.1392129721178565, + "grad_norm": 0.004534922540187836, + "learning_rate": 8.613273518933897e-05, + "loss": 0.0021, + "step": 4320 + }, + { + "epoch": 1.1405312767780633, + "grad_norm": 0.06340984255075455, + "learning_rate": 8.600079166116902e-05, + "loss": 0.0538, + "step": 4325 + }, + { + "epoch": 1.1418495814382703, + "grad_norm": 0.007374623324722052, + "learning_rate": 8.586884813299908e-05, + "loss": 0.0157, + "step": 4330 + }, + { + "epoch": 1.1431678860984773, + "grad_norm": 0.02313193492591381, + "learning_rate": 8.573690460482913e-05, + "loss": 0.0307, + "step": 4335 + }, + { + "epoch": 1.1444861907586843, + "grad_norm": 0.014071634039282799, + "learning_rate": 8.560496107665919e-05, + "loss": 0.0058, + "step": 4340 + }, + { + "epoch": 1.1458044954188913, + "grad_norm": 1.4664901494979858, + "learning_rate": 8.547301754848926e-05, + "loss": 0.0566, + "step": 4345 + }, + { + "epoch": 1.1471228000790983, + "grad_norm": 0.023680074140429497, + "learning_rate": 8.534107402031931e-05, + "loss": 0.0048, + "step": 4350 + }, + { + "epoch": 1.1484411047393053, + "grad_norm": 0.012555698864161968, + "learning_rate": 
8.520913049214937e-05, + "loss": 0.0076, + "step": 4355 + }, + { + "epoch": 1.1497594093995123, + "grad_norm": 0.013624129816889763, + "learning_rate": 8.507718696397941e-05, + "loss": 0.0373, + "step": 4360 + }, + { + "epoch": 1.1510777140597193, + "grad_norm": 0.015372387133538723, + "learning_rate": 8.494524343580947e-05, + "loss": 0.0147, + "step": 4365 + }, + { + "epoch": 1.1523960187199263, + "grad_norm": 0.3312993347644806, + "learning_rate": 8.481329990763954e-05, + "loss": 0.0299, + "step": 4370 + }, + { + "epoch": 1.1537143233801332, + "grad_norm": 0.023838184773921967, + "learning_rate": 8.468135637946959e-05, + "loss": 0.0226, + "step": 4375 + }, + { + "epoch": 1.15503262804034, + "grad_norm": 0.42516952753067017, + "learning_rate": 8.454941285129965e-05, + "loss": 0.0088, + "step": 4380 + }, + { + "epoch": 1.156350932700547, + "grad_norm": 0.6900278925895691, + "learning_rate": 8.44174693231297e-05, + "loss": 0.0245, + "step": 4385 + }, + { + "epoch": 1.157669237360754, + "grad_norm": 0.2932703197002411, + "learning_rate": 8.428552579495976e-05, + "loss": 0.0207, + "step": 4390 + }, + { + "epoch": 1.158987542020961, + "grad_norm": 0.12942780554294586, + "learning_rate": 8.415358226678982e-05, + "loss": 0.0037, + "step": 4395 + }, + { + "epoch": 1.160305846681168, + "grad_norm": 0.9499046802520752, + "learning_rate": 8.402163873861989e-05, + "loss": 0.0246, + "step": 4400 + }, + { + "epoch": 1.161624151341375, + "grad_norm": 0.008869118988513947, + "learning_rate": 8.388969521044994e-05, + "loss": 0.0171, + "step": 4405 + }, + { + "epoch": 1.162942456001582, + "grad_norm": 1.7409231662750244, + "learning_rate": 8.375775168227998e-05, + "loss": 0.017, + "step": 4410 + }, + { + "epoch": 1.164260760661789, + "grad_norm": 0.0020101398695260286, + "learning_rate": 8.362580815411004e-05, + "loss": 0.0027, + "step": 4415 + }, + { + "epoch": 1.165579065321996, + "grad_norm": 0.0785067081451416, + "learning_rate": 8.34938646259401e-05, + "loss": 0.0043, + "step": 4420 + }, + { + "epoch": 1.166897369982203, + "grad_norm": 0.0029506285209208727, + "learning_rate": 8.336192109777016e-05, + "loss": 0.0109, + "step": 4425 + }, + { + "epoch": 1.16821567464241, + "grad_norm": 0.02216683328151703, + "learning_rate": 8.322997756960022e-05, + "loss": 0.0026, + "step": 4430 + }, + { + "epoch": 1.1695339793026167, + "grad_norm": 0.02216639369726181, + "learning_rate": 8.309803404143027e-05, + "loss": 0.0045, + "step": 4435 + }, + { + "epoch": 1.170852283962824, + "grad_norm": 0.0, + "learning_rate": 8.296609051326033e-05, + "loss": 0.006, + "step": 4440 + }, + { + "epoch": 1.1721705886230307, + "grad_norm": 0.0019736960530281067, + "learning_rate": 8.283414698509039e-05, + "loss": 0.0078, + "step": 4445 + }, + { + "epoch": 1.1734888932832377, + "grad_norm": 0.012957746163010597, + "learning_rate": 8.270220345692044e-05, + "loss": 0.002, + "step": 4450 + }, + { + "epoch": 1.1748071979434447, + "grad_norm": 0.010877869091928005, + "learning_rate": 8.25702599287505e-05, + "loss": 0.0237, + "step": 4455 + }, + { + "epoch": 1.1761255026036517, + "grad_norm": 0.005947659723460674, + "learning_rate": 8.243831640058055e-05, + "loss": 0.0341, + "step": 4460 + }, + { + "epoch": 1.1774438072638587, + "grad_norm": 0.0005026470171287656, + "learning_rate": 8.230637287241061e-05, + "loss": 0.0033, + "step": 4465 + }, + { + "epoch": 1.1787621119240657, + "grad_norm": 0.022054588422179222, + "learning_rate": 8.217442934424066e-05, + "loss": 0.0042, + "step": 4470 + }, + { + "epoch": 1.1800804165842727, + 
"grad_norm": 0.7929030656814575, + "learning_rate": 8.204248581607072e-05, + "loss": 0.0076, + "step": 4475 + }, + { + "epoch": 1.1813987212444796, + "grad_norm": 0.39052629470825195, + "learning_rate": 8.191054228790078e-05, + "loss": 0.0228, + "step": 4480 + }, + { + "epoch": 1.1827170259046866, + "grad_norm": 0.007177622988820076, + "learning_rate": 8.177859875973084e-05, + "loss": 0.01, + "step": 4485 + }, + { + "epoch": 1.1840353305648936, + "grad_norm": 0.006175135262310505, + "learning_rate": 8.16466552315609e-05, + "loss": 0.0037, + "step": 4490 + }, + { + "epoch": 1.1853536352251006, + "grad_norm": 0.0356481671333313, + "learning_rate": 8.151471170339096e-05, + "loss": 0.0024, + "step": 4495 + }, + { + "epoch": 1.1866719398853074, + "grad_norm": 0.19069480895996094, + "learning_rate": 8.138276817522101e-05, + "loss": 0.0048, + "step": 4500 + }, + { + "epoch": 1.1866719398853074, + "eval_loss": 0.026386437937617302, + "eval_runtime": 452.2896, + "eval_samples_per_second": 7.455, + "eval_steps_per_second": 3.728, + "step": 4500 + }, + { + "epoch": 1.1879902445455144, + "grad_norm": 0.002254961524158716, + "learning_rate": 8.125082464705107e-05, + "loss": 0.0014, + "step": 4505 + }, + { + "epoch": 1.1893085492057214, + "grad_norm": 0.8026870489120483, + "learning_rate": 8.111888111888112e-05, + "loss": 0.0411, + "step": 4510 + }, + { + "epoch": 1.1906268538659284, + "grad_norm": 0.47328072786331177, + "learning_rate": 8.098693759071118e-05, + "loss": 0.0271, + "step": 4515 + }, + { + "epoch": 1.1919451585261354, + "grad_norm": 0.4888288676738739, + "learning_rate": 8.085499406254123e-05, + "loss": 0.039, + "step": 4520 + }, + { + "epoch": 1.1932634631863424, + "grad_norm": 0.000925812462810427, + "learning_rate": 8.072305053437129e-05, + "loss": 0.0461, + "step": 4525 + }, + { + "epoch": 1.1945817678465493, + "grad_norm": 0.12472371757030487, + "learning_rate": 8.059110700620135e-05, + "loss": 0.0037, + "step": 4530 + }, + { + "epoch": 1.1959000725067563, + "grad_norm": 0.002875336678698659, + "learning_rate": 8.04591634780314e-05, + "loss": 0.0425, + "step": 4535 + }, + { + "epoch": 1.1972183771669633, + "grad_norm": 0.042056187987327576, + "learning_rate": 8.032721994986147e-05, + "loss": 0.0068, + "step": 4540 + }, + { + "epoch": 1.1985366818271703, + "grad_norm": 0.157605841755867, + "learning_rate": 8.019527642169153e-05, + "loss": 0.0179, + "step": 4545 + }, + { + "epoch": 1.1998549864873773, + "grad_norm": 0.005153563339263201, + "learning_rate": 8.006333289352158e-05, + "loss": 0.0045, + "step": 4550 + }, + { + "epoch": 1.201173291147584, + "grad_norm": 0.02541598491370678, + "learning_rate": 7.993138936535164e-05, + "loss": 0.0041, + "step": 4555 + }, + { + "epoch": 1.2024915958077913, + "grad_norm": 0.04266195371747017, + "learning_rate": 7.979944583718168e-05, + "loss": 0.0121, + "step": 4560 + }, + { + "epoch": 1.203809900467998, + "grad_norm": 0.36108532547950745, + "learning_rate": 7.966750230901175e-05, + "loss": 0.0147, + "step": 4565 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.40405452251434326, + "learning_rate": 7.95355587808418e-05, + "loss": 0.0056, + "step": 4570 + }, + { + "epoch": 1.206446509788412, + "grad_norm": 0.030422702431678772, + "learning_rate": 7.940361525267186e-05, + "loss": 0.0055, + "step": 4575 + }, + { + "epoch": 1.207764814448619, + "grad_norm": 0.014555396512150764, + "learning_rate": 7.927167172450192e-05, + "loss": 0.0029, + "step": 4580 + }, + { + "epoch": 1.209083119108826, + "grad_norm": 0.33962950110435486, + 
"learning_rate": 7.913972819633197e-05, + "loss": 0.0191, + "step": 4585 + }, + { + "epoch": 1.210401423769033, + "grad_norm": 0.040150560438632965, + "learning_rate": 7.900778466816203e-05, + "loss": 0.0096, + "step": 4590 + }, + { + "epoch": 1.21171972842924, + "grad_norm": 0.2968510091304779, + "learning_rate": 7.88758411399921e-05, + "loss": 0.0311, + "step": 4595 + }, + { + "epoch": 1.213038033089447, + "grad_norm": 0.04709814116358757, + "learning_rate": 7.874389761182215e-05, + "loss": 0.0175, + "step": 4600 + }, + { + "epoch": 1.214356337749654, + "grad_norm": 0.1379537284374237, + "learning_rate": 7.861195408365221e-05, + "loss": 0.02, + "step": 4605 + }, + { + "epoch": 1.215674642409861, + "grad_norm": 0.018291711807250977, + "learning_rate": 7.848001055548225e-05, + "loss": 0.003, + "step": 4610 + }, + { + "epoch": 1.216992947070068, + "grad_norm": 0.041676126420497894, + "learning_rate": 7.83480670273123e-05, + "loss": 0.0054, + "step": 4615 + }, + { + "epoch": 1.2183112517302748, + "grad_norm": 0.0013747498160228133, + "learning_rate": 7.821612349914237e-05, + "loss": 0.0132, + "step": 4620 + }, + { + "epoch": 1.2196295563904818, + "grad_norm": 0.0050489697605371475, + "learning_rate": 7.808417997097243e-05, + "loss": 0.0272, + "step": 4625 + }, + { + "epoch": 1.2209478610506888, + "grad_norm": 0.017974581569433212, + "learning_rate": 7.795223644280249e-05, + "loss": 0.0037, + "step": 4630 + }, + { + "epoch": 1.2222661657108957, + "grad_norm": 0.001916698063723743, + "learning_rate": 7.782029291463254e-05, + "loss": 0.002, + "step": 4635 + }, + { + "epoch": 1.2235844703711027, + "grad_norm": 0.05344574153423309, + "learning_rate": 7.76883493864626e-05, + "loss": 0.0114, + "step": 4640 + }, + { + "epoch": 1.2249027750313097, + "grad_norm": 0.22823786735534668, + "learning_rate": 7.755640585829265e-05, + "loss": 0.0296, + "step": 4645 + }, + { + "epoch": 1.2262210796915167, + "grad_norm": 0.02051074244081974, + "learning_rate": 7.742446233012272e-05, + "loss": 0.0037, + "step": 4650 + }, + { + "epoch": 1.2275393843517237, + "grad_norm": 0.9797061681747437, + "learning_rate": 7.729251880195276e-05, + "loss": 0.011, + "step": 4655 + }, + { + "epoch": 1.2288576890119307, + "grad_norm": 0.0017285927897319198, + "learning_rate": 7.716057527378282e-05, + "loss": 0.0224, + "step": 4660 + }, + { + "epoch": 1.2301759936721377, + "grad_norm": 0.021783018484711647, + "learning_rate": 7.702863174561288e-05, + "loss": 0.0174, + "step": 4665 + }, + { + "epoch": 1.2314942983323447, + "grad_norm": 0.00763307698071003, + "learning_rate": 7.689668821744293e-05, + "loss": 0.0516, + "step": 4670 + }, + { + "epoch": 1.2328126029925515, + "grad_norm": 0.32605209946632385, + "learning_rate": 7.676474468927299e-05, + "loss": 0.0301, + "step": 4675 + }, + { + "epoch": 1.2341309076527585, + "grad_norm": 1.2027722597122192, + "learning_rate": 7.663280116110306e-05, + "loss": 0.0474, + "step": 4680 + }, + { + "epoch": 1.2354492123129655, + "grad_norm": 0.10201717168092728, + "learning_rate": 7.650085763293311e-05, + "loss": 0.0144, + "step": 4685 + }, + { + "epoch": 1.2367675169731724, + "grad_norm": 0.013835664838552475, + "learning_rate": 7.636891410476317e-05, + "loss": 0.0024, + "step": 4690 + }, + { + "epoch": 1.2380858216333794, + "grad_norm": 0.005699916277080774, + "learning_rate": 7.623697057659322e-05, + "loss": 0.0089, + "step": 4695 + }, + { + "epoch": 1.2394041262935864, + "grad_norm": 0.16583332419395447, + "learning_rate": 7.610502704842328e-05, + "loss": 0.019, + "step": 4700 + }, + { + 
"epoch": 1.2407224309537934, + "grad_norm": 0.2734023332595825, + "learning_rate": 7.597308352025333e-05, + "loss": 0.0041, + "step": 4705 + }, + { + "epoch": 1.2420407356140004, + "grad_norm": 0.04209504276514053, + "learning_rate": 7.584113999208339e-05, + "loss": 0.0292, + "step": 4710 + }, + { + "epoch": 1.2433590402742074, + "grad_norm": 0.0303195733577013, + "learning_rate": 7.570919646391345e-05, + "loss": 0.0019, + "step": 4715 + }, + { + "epoch": 1.2446773449344144, + "grad_norm": 0.014011899940669537, + "learning_rate": 7.55772529357435e-05, + "loss": 0.0236, + "step": 4720 + }, + { + "epoch": 1.2459956495946214, + "grad_norm": 0.37838876247406006, + "learning_rate": 7.544530940757356e-05, + "loss": 0.0081, + "step": 4725 + }, + { + "epoch": 1.2473139542548284, + "grad_norm": 0.003717717481777072, + "learning_rate": 7.531336587940361e-05, + "loss": 0.0036, + "step": 4730 + }, + { + "epoch": 1.2486322589150354, + "grad_norm": 1.2284752130508423, + "learning_rate": 7.518142235123368e-05, + "loss": 0.0089, + "step": 4735 + }, + { + "epoch": 1.2499505635752421, + "grad_norm": 0.015356095507740974, + "learning_rate": 7.504947882306374e-05, + "loss": 0.0074, + "step": 4740 + }, + { + "epoch": 1.2512688682354491, + "grad_norm": 0.0020383282098919153, + "learning_rate": 7.49175352948938e-05, + "loss": 0.0444, + "step": 4745 + }, + { + "epoch": 1.2525871728956561, + "grad_norm": 0.006680132355540991, + "learning_rate": 7.478559176672385e-05, + "loss": 0.009, + "step": 4750 + }, + { + "epoch": 1.2539054775558631, + "grad_norm": 0.01650019735097885, + "learning_rate": 7.465364823855389e-05, + "loss": 0.0022, + "step": 4755 + }, + { + "epoch": 1.2552237822160701, + "grad_norm": 0.009536102414131165, + "learning_rate": 7.452170471038396e-05, + "loss": 0.0026, + "step": 4760 + }, + { + "epoch": 1.256542086876277, + "grad_norm": 0.04677430912852287, + "learning_rate": 7.438976118221402e-05, + "loss": 0.004, + "step": 4765 + }, + { + "epoch": 1.257860391536484, + "grad_norm": 0.007777783088386059, + "learning_rate": 7.425781765404407e-05, + "loss": 0.0112, + "step": 4770 + }, + { + "epoch": 1.259178696196691, + "grad_norm": 0.03724197298288345, + "learning_rate": 7.412587412587413e-05, + "loss": 0.0065, + "step": 4775 + }, + { + "epoch": 1.260497000856898, + "grad_norm": 0.0023958412930369377, + "learning_rate": 7.399393059770418e-05, + "loss": 0.0238, + "step": 4780 + }, + { + "epoch": 1.261815305517105, + "grad_norm": 0.0036889975890517235, + "learning_rate": 7.386198706953424e-05, + "loss": 0.0012, + "step": 4785 + }, + { + "epoch": 1.263133610177312, + "grad_norm": 0.0009220903157256544, + "learning_rate": 7.373004354136431e-05, + "loss": 0.0017, + "step": 4790 + }, + { + "epoch": 1.2644519148375188, + "grad_norm": 0.0033395602367818356, + "learning_rate": 7.359810001319436e-05, + "loss": 0.0474, + "step": 4795 + }, + { + "epoch": 1.265770219497726, + "grad_norm": 0.004093261435627937, + "learning_rate": 7.346615648502442e-05, + "loss": 0.0025, + "step": 4800 + }, + { + "epoch": 1.2670885241579328, + "grad_norm": 0.004395488649606705, + "learning_rate": 7.333421295685446e-05, + "loss": 0.0011, + "step": 4805 + }, + { + "epoch": 1.2684068288181398, + "grad_norm": 0.024034051224589348, + "learning_rate": 7.320226942868452e-05, + "loss": 0.0027, + "step": 4810 + }, + { + "epoch": 1.2697251334783468, + "grad_norm": 0.9501499533653259, + "learning_rate": 7.307032590051459e-05, + "loss": 0.0279, + "step": 4815 + }, + { + "epoch": 1.2710434381385538, + "grad_norm": 0.008805549703538418, + 
"learning_rate": 7.293838237234464e-05, + "loss": 0.0403, + "step": 4820 + }, + { + "epoch": 1.2723617427987608, + "grad_norm": 0.01750873774290085, + "learning_rate": 7.28064388441747e-05, + "loss": 0.0571, + "step": 4825 + }, + { + "epoch": 1.2736800474589678, + "grad_norm": 0.004490260500460863, + "learning_rate": 7.267449531600475e-05, + "loss": 0.0269, + "step": 4830 + }, + { + "epoch": 1.2749983521191748, + "grad_norm": 0.07510064542293549, + "learning_rate": 7.254255178783481e-05, + "loss": 0.0123, + "step": 4835 + }, + { + "epoch": 1.2763166567793818, + "grad_norm": 0.039783038198947906, + "learning_rate": 7.241060825966486e-05, + "loss": 0.0137, + "step": 4840 + }, + { + "epoch": 1.2776349614395888, + "grad_norm": 0.019004900008440018, + "learning_rate": 7.227866473149493e-05, + "loss": 0.0047, + "step": 4845 + }, + { + "epoch": 1.2789532660997955, + "grad_norm": 0.04813052713871002, + "learning_rate": 7.214672120332499e-05, + "loss": 0.0021, + "step": 4850 + }, + { + "epoch": 1.2802715707600028, + "grad_norm": 0.00835048221051693, + "learning_rate": 7.201477767515503e-05, + "loss": 0.0014, + "step": 4855 + }, + { + "epoch": 1.2815898754202095, + "grad_norm": 0.008609198965132236, + "learning_rate": 7.188283414698509e-05, + "loss": 0.0219, + "step": 4860 + }, + { + "epoch": 1.2829081800804165, + "grad_norm": 0.007337458431720734, + "learning_rate": 7.175089061881514e-05, + "loss": 0.0014, + "step": 4865 + }, + { + "epoch": 1.2842264847406235, + "grad_norm": 0.0032645913306623697, + "learning_rate": 7.161894709064521e-05, + "loss": 0.0026, + "step": 4870 + }, + { + "epoch": 1.2855447894008305, + "grad_norm": 0.27384671568870544, + "learning_rate": 7.148700356247527e-05, + "loss": 0.0227, + "step": 4875 + }, + { + "epoch": 1.2868630940610375, + "grad_norm": 0.03584875538945198, + "learning_rate": 7.135506003430532e-05, + "loss": 0.0299, + "step": 4880 + }, + { + "epoch": 1.2881813987212445, + "grad_norm": 0.03482440486550331, + "learning_rate": 7.122311650613538e-05, + "loss": 0.0125, + "step": 4885 + }, + { + "epoch": 1.2894997033814515, + "grad_norm": 0.005974395200610161, + "learning_rate": 7.109117297796543e-05, + "loss": 0.0029, + "step": 4890 + }, + { + "epoch": 1.2908180080416585, + "grad_norm": 0.01820153370499611, + "learning_rate": 7.095922944979549e-05, + "loss": 0.0254, + "step": 4895 + }, + { + "epoch": 1.2921363127018655, + "grad_norm": 0.1733965277671814, + "learning_rate": 7.082728592162555e-05, + "loss": 0.028, + "step": 4900 + }, + { + "epoch": 1.2934546173620725, + "grad_norm": 1.3017303943634033, + "learning_rate": 7.06953423934556e-05, + "loss": 0.0213, + "step": 4905 + }, + { + "epoch": 1.2947729220222794, + "grad_norm": 0.01360877975821495, + "learning_rate": 7.056339886528566e-05, + "loss": 0.0039, + "step": 4910 + }, + { + "epoch": 1.2960912266824862, + "grad_norm": 0.01503999624401331, + "learning_rate": 7.043145533711571e-05, + "loss": 0.0102, + "step": 4915 + }, + { + "epoch": 1.2974095313426934, + "grad_norm": 0.2200804352760315, + "learning_rate": 7.029951180894577e-05, + "loss": 0.0461, + "step": 4920 + }, + { + "epoch": 1.2987278360029002, + "grad_norm": 0.08512946963310242, + "learning_rate": 7.016756828077582e-05, + "loss": 0.0066, + "step": 4925 + }, + { + "epoch": 1.3000461406631072, + "grad_norm": 0.08296570926904678, + "learning_rate": 7.00356247526059e-05, + "loss": 0.0223, + "step": 4930 + }, + { + "epoch": 1.3013644453233142, + "grad_norm": 0.008866079151630402, + "learning_rate": 6.990368122443595e-05, + "loss": 0.0032, + "step": 4935 + }, + 
{ + "epoch": 1.3026827499835212, + "grad_norm": 0.024493014439940453, + "learning_rate": 6.9771737696266e-05, + "loss": 0.0128, + "step": 4940 + }, + { + "epoch": 1.3040010546437282, + "grad_norm": 0.08965341746807098, + "learning_rate": 6.963979416809606e-05, + "loss": 0.028, + "step": 4945 + }, + { + "epoch": 1.3053193593039352, + "grad_norm": 0.023156631737947464, + "learning_rate": 6.950785063992612e-05, + "loss": 0.0187, + "step": 4950 + }, + { + "epoch": 1.3066376639641422, + "grad_norm": 0.18552155792713165, + "learning_rate": 6.937590711175617e-05, + "loss": 0.0424, + "step": 4955 + }, + { + "epoch": 1.3079559686243492, + "grad_norm": 0.02200198918581009, + "learning_rate": 6.924396358358623e-05, + "loss": 0.0148, + "step": 4960 + }, + { + "epoch": 1.3092742732845561, + "grad_norm": 0.00568364467471838, + "learning_rate": 6.911202005541628e-05, + "loss": 0.0199, + "step": 4965 + }, + { + "epoch": 1.310592577944763, + "grad_norm": 0.021591177210211754, + "learning_rate": 6.898007652724634e-05, + "loss": 0.0092, + "step": 4970 + }, + { + "epoch": 1.3119108826049701, + "grad_norm": 0.327177494764328, + "learning_rate": 6.88481329990764e-05, + "loss": 0.0047, + "step": 4975 + }, + { + "epoch": 1.313229187265177, + "grad_norm": 0.024512887001037598, + "learning_rate": 6.871618947090645e-05, + "loss": 0.0046, + "step": 4980 + }, + { + "epoch": 1.314547491925384, + "grad_norm": 0.05725006014108658, + "learning_rate": 6.858424594273652e-05, + "loss": 0.0227, + "step": 4985 + }, + { + "epoch": 1.3158657965855909, + "grad_norm": 0.011280277743935585, + "learning_rate": 6.845230241456658e-05, + "loss": 0.0056, + "step": 4990 + }, + { + "epoch": 1.3171841012457979, + "grad_norm": 0.022504402324557304, + "learning_rate": 6.832035888639663e-05, + "loss": 0.0029, + "step": 4995 + }, + { + "epoch": 1.3185024059060049, + "grad_norm": 0.02168826013803482, + "learning_rate": 6.818841535822669e-05, + "loss": 0.0198, + "step": 5000 + }, + { + "epoch": 1.3185024059060049, + "eval_loss": 0.025039294734597206, + "eval_runtime": 452.1097, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 5000 + }, + { + "epoch": 1.3198207105662119, + "grad_norm": 0.0064329709857702255, + "learning_rate": 6.805647183005673e-05, + "loss": 0.0299, + "step": 5005 + }, + { + "epoch": 1.3211390152264189, + "grad_norm": 0.00267885928042233, + "learning_rate": 6.79245283018868e-05, + "loss": 0.0065, + "step": 5010 + }, + { + "epoch": 1.3224573198866258, + "grad_norm": 0.6842889189720154, + "learning_rate": 6.779258477371685e-05, + "loss": 0.008, + "step": 5015 + }, + { + "epoch": 1.3237756245468328, + "grad_norm": 0.002985635306686163, + "learning_rate": 6.766064124554691e-05, + "loss": 0.0119, + "step": 5020 + }, + { + "epoch": 1.3250939292070396, + "grad_norm": 0.019304940477013588, + "learning_rate": 6.752869771737696e-05, + "loss": 0.0041, + "step": 5025 + }, + { + "epoch": 1.3264122338672468, + "grad_norm": 0.011305035091936588, + "learning_rate": 6.739675418920702e-05, + "loss": 0.0031, + "step": 5030 + }, + { + "epoch": 1.3277305385274536, + "grad_norm": 0.006184784695506096, + "learning_rate": 6.726481066103708e-05, + "loss": 0.0081, + "step": 5035 + }, + { + "epoch": 1.3290488431876606, + "grad_norm": 0.0073184361681342125, + "learning_rate": 6.713286713286715e-05, + "loss": 0.0202, + "step": 5040 + }, + { + "epoch": 1.3303671478478676, + "grad_norm": 0.006566181313246489, + "learning_rate": 6.70009236046972e-05, + "loss": 0.0052, + "step": 5045 + }, + { + "epoch": 1.3316854525080746, + 
"grad_norm": 0.31427526473999023, + "learning_rate": 6.686898007652726e-05, + "loss": 0.017, + "step": 5050 + }, + { + "epoch": 1.3330037571682816, + "grad_norm": 0.005085447803139687, + "learning_rate": 6.67370365483573e-05, + "loss": 0.009, + "step": 5055 + }, + { + "epoch": 1.3343220618284886, + "grad_norm": 0.2745366096496582, + "learning_rate": 6.660509302018735e-05, + "loss": 0.0119, + "step": 5060 + }, + { + "epoch": 1.3356403664886956, + "grad_norm": 0.2871796786785126, + "learning_rate": 6.647314949201742e-05, + "loss": 0.0158, + "step": 5065 + }, + { + "epoch": 1.3369586711489025, + "grad_norm": 0.2774186134338379, + "learning_rate": 6.634120596384748e-05, + "loss": 0.0084, + "step": 5070 + }, + { + "epoch": 1.3382769758091095, + "grad_norm": 0.013278775848448277, + "learning_rate": 6.620926243567753e-05, + "loss": 0.0111, + "step": 5075 + }, + { + "epoch": 1.3395952804693165, + "grad_norm": 0.01614517532289028, + "learning_rate": 6.607731890750759e-05, + "loss": 0.0066, + "step": 5080 + }, + { + "epoch": 1.3409135851295235, + "grad_norm": 0.0037789656780660152, + "learning_rate": 6.594537537933765e-05, + "loss": 0.0142, + "step": 5085 + }, + { + "epoch": 1.3422318897897303, + "grad_norm": 0.03221861273050308, + "learning_rate": 6.58134318511677e-05, + "loss": 0.0155, + "step": 5090 + }, + { + "epoch": 1.3435501944499375, + "grad_norm": 0.005637989845126867, + "learning_rate": 6.568148832299776e-05, + "loss": 0.0022, + "step": 5095 + }, + { + "epoch": 1.3448684991101443, + "grad_norm": 0.0017844432732090354, + "learning_rate": 6.554954479482783e-05, + "loss": 0.0217, + "step": 5100 + }, + { + "epoch": 1.3461868037703513, + "grad_norm": 0.08099021762609482, + "learning_rate": 6.541760126665787e-05, + "loss": 0.0222, + "step": 5105 + }, + { + "epoch": 1.3475051084305583, + "grad_norm": 0.011909045279026031, + "learning_rate": 6.528565773848792e-05, + "loss": 0.0058, + "step": 5110 + }, + { + "epoch": 1.3488234130907653, + "grad_norm": 0.7332578301429749, + "learning_rate": 6.515371421031798e-05, + "loss": 0.0286, + "step": 5115 + }, + { + "epoch": 1.3501417177509722, + "grad_norm": 0.3415885865688324, + "learning_rate": 6.502177068214804e-05, + "loss": 0.1191, + "step": 5120 + }, + { + "epoch": 1.3514600224111792, + "grad_norm": 0.00904211588203907, + "learning_rate": 6.48898271539781e-05, + "loss": 0.0043, + "step": 5125 + }, + { + "epoch": 1.3527783270713862, + "grad_norm": 0.1978830248117447, + "learning_rate": 6.475788362580816e-05, + "loss": 0.0316, + "step": 5130 + }, + { + "epoch": 1.3540966317315932, + "grad_norm": 0.10229042172431946, + "learning_rate": 6.462594009763822e-05, + "loss": 0.0194, + "step": 5135 + }, + { + "epoch": 1.3554149363918002, + "grad_norm": 0.4457210600376129, + "learning_rate": 6.449399656946827e-05, + "loss": 0.0276, + "step": 5140 + }, + { + "epoch": 1.356733241052007, + "grad_norm": 0.023706572130322456, + "learning_rate": 6.436205304129833e-05, + "loss": 0.0163, + "step": 5145 + }, + { + "epoch": 1.3580515457122142, + "grad_norm": 1.166896939277649, + "learning_rate": 6.423010951312838e-05, + "loss": 0.0189, + "step": 5150 + }, + { + "epoch": 1.359369850372421, + "grad_norm": 0.0016115796752274036, + "learning_rate": 6.409816598495844e-05, + "loss": 0.0191, + "step": 5155 + }, + { + "epoch": 1.360688155032628, + "grad_norm": 0.00786682777106762, + "learning_rate": 6.39662224567885e-05, + "loss": 0.0119, + "step": 5160 + }, + { + "epoch": 1.362006459692835, + "grad_norm": 1.042732834815979, + "learning_rate": 6.383427892861855e-05, + "loss": 
0.0497, + "step": 5165 + }, + { + "epoch": 1.363324764353042, + "grad_norm": 0.007983304560184479, + "learning_rate": 6.37023354004486e-05, + "loss": 0.044, + "step": 5170 + }, + { + "epoch": 1.364643069013249, + "grad_norm": 0.009767642244696617, + "learning_rate": 6.357039187227866e-05, + "loss": 0.0405, + "step": 5175 + }, + { + "epoch": 1.365961373673456, + "grad_norm": 0.03164628520607948, + "learning_rate": 6.343844834410873e-05, + "loss": 0.0138, + "step": 5180 + }, + { + "epoch": 1.367279678333663, + "grad_norm": 0.004159921780228615, + "learning_rate": 6.330650481593879e-05, + "loss": 0.0045, + "step": 5185 + }, + { + "epoch": 1.36859798299387, + "grad_norm": 0.004395391326397657, + "learning_rate": 6.317456128776884e-05, + "loss": 0.0046, + "step": 5190 + }, + { + "epoch": 1.369916287654077, + "grad_norm": 0.011886746622622013, + "learning_rate": 6.30426177595989e-05, + "loss": 0.0064, + "step": 5195 + }, + { + "epoch": 1.371234592314284, + "grad_norm": 0.2259266972541809, + "learning_rate": 6.291067423142895e-05, + "loss": 0.0076, + "step": 5200 + }, + { + "epoch": 1.372552896974491, + "grad_norm": 0.01407301053404808, + "learning_rate": 6.277873070325901e-05, + "loss": 0.0201, + "step": 5205 + }, + { + "epoch": 1.3738712016346977, + "grad_norm": 0.00911578256636858, + "learning_rate": 6.264678717508906e-05, + "loss": 0.0164, + "step": 5210 + }, + { + "epoch": 1.3751895062949049, + "grad_norm": 0.20968014001846313, + "learning_rate": 6.251484364691912e-05, + "loss": 0.0075, + "step": 5215 + }, + { + "epoch": 1.3765078109551117, + "grad_norm": 0.008801166899502277, + "learning_rate": 6.238290011874918e-05, + "loss": 0.0068, + "step": 5220 + }, + { + "epoch": 1.3778261156153186, + "grad_norm": 0.007181806955486536, + "learning_rate": 6.225095659057923e-05, + "loss": 0.0136, + "step": 5225 + }, + { + "epoch": 1.3791444202755256, + "grad_norm": 0.7527109980583191, + "learning_rate": 6.211901306240929e-05, + "loss": 0.0287, + "step": 5230 + }, + { + "epoch": 1.3804627249357326, + "grad_norm": 0.039015207439661026, + "learning_rate": 6.198706953423936e-05, + "loss": 0.0326, + "step": 5235 + }, + { + "epoch": 1.3817810295959396, + "grad_norm": 0.021076606586575508, + "learning_rate": 6.185512600606941e-05, + "loss": 0.0191, + "step": 5240 + }, + { + "epoch": 1.3830993342561466, + "grad_norm": 0.016630731523036957, + "learning_rate": 6.172318247789947e-05, + "loss": 0.0131, + "step": 5245 + }, + { + "epoch": 1.3844176389163536, + "grad_norm": 0.011133644729852676, + "learning_rate": 6.159123894972952e-05, + "loss": 0.0029, + "step": 5250 + }, + { + "epoch": 1.3857359435765606, + "grad_norm": 0.6434677243232727, + "learning_rate": 6.145929542155957e-05, + "loss": 0.0091, + "step": 5255 + }, + { + "epoch": 1.3870542482367676, + "grad_norm": 0.051020298153162, + "learning_rate": 6.132735189338964e-05, + "loss": 0.0086, + "step": 5260 + }, + { + "epoch": 1.3883725528969744, + "grad_norm": 0.016413932666182518, + "learning_rate": 6.119540836521969e-05, + "loss": 0.0061, + "step": 5265 + }, + { + "epoch": 1.3896908575571816, + "grad_norm": 0.005769540090113878, + "learning_rate": 6.106346483704975e-05, + "loss": 0.0027, + "step": 5270 + }, + { + "epoch": 1.3910091622173884, + "grad_norm": 0.06687796860933304, + "learning_rate": 6.09315213088798e-05, + "loss": 0.0423, + "step": 5275 + }, + { + "epoch": 1.3923274668775953, + "grad_norm": 0.005641553085297346, + "learning_rate": 6.079957778070986e-05, + "loss": 0.0353, + "step": 5280 + }, + { + "epoch": 1.3936457715378023, + "grad_norm": 
0.04460568353533745, + "learning_rate": 6.066763425253992e-05, + "loss": 0.0041, + "step": 5285 + }, + { + "epoch": 1.3949640761980093, + "grad_norm": 0.0387534461915493, + "learning_rate": 6.0535690724369976e-05, + "loss": 0.006, + "step": 5290 + }, + { + "epoch": 1.3962823808582163, + "grad_norm": 0.010292598977684975, + "learning_rate": 6.040374719620003e-05, + "loss": 0.0038, + "step": 5295 + }, + { + "epoch": 1.3976006855184233, + "grad_norm": 0.3646155297756195, + "learning_rate": 6.0271803668030094e-05, + "loss": 0.0111, + "step": 5300 + }, + { + "epoch": 1.3989189901786303, + "grad_norm": 0.022035539150238037, + "learning_rate": 6.0139860139860136e-05, + "loss": 0.0507, + "step": 5305 + }, + { + "epoch": 1.4002372948388373, + "grad_norm": 0.003314939560368657, + "learning_rate": 6.00079166116902e-05, + "loss": 0.0132, + "step": 5310 + }, + { + "epoch": 1.4015555994990443, + "grad_norm": 0.0838267058134079, + "learning_rate": 5.9875973083520254e-05, + "loss": 0.0105, + "step": 5315 + }, + { + "epoch": 1.4028739041592513, + "grad_norm": 0.009368584491312504, + "learning_rate": 5.974402955535031e-05, + "loss": 0.0026, + "step": 5320 + }, + { + "epoch": 1.4041922088194583, + "grad_norm": 0.031248098239302635, + "learning_rate": 5.961208602718037e-05, + "loss": 0.0151, + "step": 5325 + }, + { + "epoch": 1.405510513479665, + "grad_norm": 0.06447605788707733, + "learning_rate": 5.948014249901043e-05, + "loss": 0.0219, + "step": 5330 + }, + { + "epoch": 1.4068288181398723, + "grad_norm": 0.010814374312758446, + "learning_rate": 5.9348198970840484e-05, + "loss": 0.0038, + "step": 5335 + }, + { + "epoch": 1.408147122800079, + "grad_norm": 0.6235967874526978, + "learning_rate": 5.9216255442670546e-05, + "loss": 0.0354, + "step": 5340 + }, + { + "epoch": 1.409465427460286, + "grad_norm": 0.026741521432995796, + "learning_rate": 5.90843119145006e-05, + "loss": 0.0032, + "step": 5345 + }, + { + "epoch": 1.410783732120493, + "grad_norm": 0.019413433969020844, + "learning_rate": 5.895236838633066e-05, + "loss": 0.0216, + "step": 5350 + }, + { + "epoch": 1.4121020367807, + "grad_norm": 0.0735543966293335, + "learning_rate": 5.8820424858160706e-05, + "loss": 0.0033, + "step": 5355 + }, + { + "epoch": 1.413420341440907, + "grad_norm": 0.005189546383917332, + "learning_rate": 5.868848132999076e-05, + "loss": 0.021, + "step": 5360 + }, + { + "epoch": 1.414738646101114, + "grad_norm": 0.21240335702896118, + "learning_rate": 5.8556537801820824e-05, + "loss": 0.0294, + "step": 5365 + }, + { + "epoch": 1.416056950761321, + "grad_norm": 0.010165920481085777, + "learning_rate": 5.842459427365088e-05, + "loss": 0.0021, + "step": 5370 + }, + { + "epoch": 1.417375255421528, + "grad_norm": 0.026774069294333458, + "learning_rate": 5.8292650745480936e-05, + "loss": 0.0299, + "step": 5375 + }, + { + "epoch": 1.418693560081735, + "grad_norm": 0.0019810455851256847, + "learning_rate": 5.816070721731099e-05, + "loss": 0.0029, + "step": 5380 + }, + { + "epoch": 1.4200118647419417, + "grad_norm": 0.038888879120349884, + "learning_rate": 5.8028763689141054e-05, + "loss": 0.0069, + "step": 5385 + }, + { + "epoch": 1.421330169402149, + "grad_norm": 0.016180936247110367, + "learning_rate": 5.789682016097111e-05, + "loss": 0.0032, + "step": 5390 + }, + { + "epoch": 1.4226484740623557, + "grad_norm": 0.01119404286146164, + "learning_rate": 5.7764876632801165e-05, + "loss": 0.0024, + "step": 5395 + }, + { + "epoch": 1.4239667787225627, + "grad_norm": 0.010486694052815437, + "learning_rate": 5.763293310463123e-05, + "loss": 
0.0324, + "step": 5400 + }, + { + "epoch": 1.4252850833827697, + "grad_norm": 0.005453066434711218, + "learning_rate": 5.750098957646127e-05, + "loss": 0.0038, + "step": 5405 + }, + { + "epoch": 1.4266033880429767, + "grad_norm": 0.17556461691856384, + "learning_rate": 5.736904604829133e-05, + "loss": 0.0305, + "step": 5410 + }, + { + "epoch": 1.4279216927031837, + "grad_norm": 0.03074715845286846, + "learning_rate": 5.723710252012139e-05, + "loss": 0.003, + "step": 5415 + }, + { + "epoch": 1.4292399973633907, + "grad_norm": 1.7238941192626953, + "learning_rate": 5.710515899195144e-05, + "loss": 0.0254, + "step": 5420 + }, + { + "epoch": 1.4305583020235977, + "grad_norm": 0.012462320737540722, + "learning_rate": 5.6973215463781506e-05, + "loss": 0.0018, + "step": 5425 + }, + { + "epoch": 1.4318766066838047, + "grad_norm": 0.021576853469014168, + "learning_rate": 5.684127193561156e-05, + "loss": 0.0472, + "step": 5430 + }, + { + "epoch": 1.4331949113440117, + "grad_norm": 0.2862134575843811, + "learning_rate": 5.670932840744162e-05, + "loss": 0.0258, + "step": 5435 + }, + { + "epoch": 1.4345132160042184, + "grad_norm": 0.28419312834739685, + "learning_rate": 5.657738487927168e-05, + "loss": 0.0053, + "step": 5440 + }, + { + "epoch": 1.4358315206644257, + "grad_norm": 0.013650139793753624, + "learning_rate": 5.6445441351101735e-05, + "loss": 0.0126, + "step": 5445 + }, + { + "epoch": 1.4371498253246324, + "grad_norm": 0.01203097216784954, + "learning_rate": 5.631349782293179e-05, + "loss": 0.0076, + "step": 5450 + }, + { + "epoch": 1.4384681299848394, + "grad_norm": 0.0881054624915123, + "learning_rate": 5.618155429476184e-05, + "loss": 0.0178, + "step": 5455 + }, + { + "epoch": 1.4397864346450464, + "grad_norm": 0.5258516669273376, + "learning_rate": 5.6049610766591895e-05, + "loss": 0.0112, + "step": 5460 + }, + { + "epoch": 1.4411047393052534, + "grad_norm": 0.001202153041958809, + "learning_rate": 5.591766723842196e-05, + "loss": 0.0089, + "step": 5465 + }, + { + "epoch": 1.4424230439654604, + "grad_norm": 0.4498993456363678, + "learning_rate": 5.5785723710252014e-05, + "loss": 0.0252, + "step": 5470 + }, + { + "epoch": 1.4437413486256674, + "grad_norm": 0.17477644979953766, + "learning_rate": 5.565378018208207e-05, + "loss": 0.0169, + "step": 5475 + }, + { + "epoch": 1.4450596532858744, + "grad_norm": 0.019443338736891747, + "learning_rate": 5.552183665391213e-05, + "loss": 0.0019, + "step": 5480 + }, + { + "epoch": 1.4463779579460814, + "grad_norm": 0.005653039086610079, + "learning_rate": 5.538989312574219e-05, + "loss": 0.0231, + "step": 5485 + }, + { + "epoch": 1.4476962626062884, + "grad_norm": 0.01554112322628498, + "learning_rate": 5.525794959757224e-05, + "loss": 0.0167, + "step": 5490 + }, + { + "epoch": 1.4490145672664954, + "grad_norm": 0.044272180646657944, + "learning_rate": 5.5126006069402305e-05, + "loss": 0.007, + "step": 5495 + }, + { + "epoch": 1.4503328719267023, + "grad_norm": 0.014857172966003418, + "learning_rate": 5.499406254123236e-05, + "loss": 0.0045, + "step": 5500 + }, + { + "epoch": 1.4503328719267023, + "eval_loss": 0.02392147295176983, + "eval_runtime": 452.468, + "eval_samples_per_second": 7.452, + "eval_steps_per_second": 3.726, + "step": 5500 + }, + { + "epoch": 1.4516511765869091, + "grad_norm": 0.007390835788100958, + "learning_rate": 5.486211901306241e-05, + "loss": 0.0171, + "step": 5505 + }, + { + "epoch": 1.4529694812471163, + "grad_norm": 0.0050474610179662704, + "learning_rate": 5.4730175484892466e-05, + "loss": 0.004, + "step": 5510 + }, + { + 
"epoch": 1.454287785907323, + "grad_norm": 0.08066163957118988, + "learning_rate": 5.459823195672252e-05, + "loss": 0.0103, + "step": 5515 + }, + { + "epoch": 1.45560609056753, + "grad_norm": 0.0062376330606639385, + "learning_rate": 5.4466288428552584e-05, + "loss": 0.0066, + "step": 5520 + }, + { + "epoch": 1.456924395227737, + "grad_norm": 0.00711809890344739, + "learning_rate": 5.433434490038264e-05, + "loss": 0.003, + "step": 5525 + }, + { + "epoch": 1.458242699887944, + "grad_norm": 0.004010149277746677, + "learning_rate": 5.4202401372212695e-05, + "loss": 0.0231, + "step": 5530 + }, + { + "epoch": 1.459561004548151, + "grad_norm": 0.4791967272758484, + "learning_rate": 5.407045784404276e-05, + "loss": 0.0277, + "step": 5535 + }, + { + "epoch": 1.460879309208358, + "grad_norm": 0.03979189693927765, + "learning_rate": 5.393851431587281e-05, + "loss": 0.0033, + "step": 5540 + }, + { + "epoch": 1.462197613868565, + "grad_norm": 0.03331119939684868, + "learning_rate": 5.380657078770287e-05, + "loss": 0.0187, + "step": 5545 + }, + { + "epoch": 1.463515918528772, + "grad_norm": 0.0042802803218364716, + "learning_rate": 5.367462725953293e-05, + "loss": 0.0032, + "step": 5550 + }, + { + "epoch": 1.464834223188979, + "grad_norm": 0.05439918115735054, + "learning_rate": 5.354268373136297e-05, + "loss": 0.0043, + "step": 5555 + }, + { + "epoch": 1.4661525278491858, + "grad_norm": 0.042643506079912186, + "learning_rate": 5.3410740203193036e-05, + "loss": 0.0059, + "step": 5560 + }, + { + "epoch": 1.467470832509393, + "grad_norm": 0.023453116416931152, + "learning_rate": 5.327879667502309e-05, + "loss": 0.0043, + "step": 5565 + }, + { + "epoch": 1.4687891371695998, + "grad_norm": 0.037712760269641876, + "learning_rate": 5.314685314685315e-05, + "loss": 0.0033, + "step": 5570 + }, + { + "epoch": 1.4701074418298068, + "grad_norm": 1.0485608577728271, + "learning_rate": 5.301490961868321e-05, + "loss": 0.0489, + "step": 5575 + }, + { + "epoch": 1.4714257464900138, + "grad_norm": 0.004728829488158226, + "learning_rate": 5.2882966090513265e-05, + "loss": 0.0067, + "step": 5580 + }, + { + "epoch": 1.4727440511502208, + "grad_norm": 0.027893677353858948, + "learning_rate": 5.275102256234332e-05, + "loss": 0.0208, + "step": 5585 + }, + { + "epoch": 1.4740623558104278, + "grad_norm": 0.02256879396736622, + "learning_rate": 5.2619079034173377e-05, + "loss": 0.0036, + "step": 5590 + }, + { + "epoch": 1.4753806604706348, + "grad_norm": 0.12636558711528778, + "learning_rate": 5.248713550600344e-05, + "loss": 0.0046, + "step": 5595 + }, + { + "epoch": 1.4766989651308418, + "grad_norm": 0.000997041119262576, + "learning_rate": 5.235519197783348e-05, + "loss": 0.0101, + "step": 5600 + }, + { + "epoch": 1.4780172697910487, + "grad_norm": 0.023494020104408264, + "learning_rate": 5.2223248449663543e-05, + "loss": 0.0039, + "step": 5605 + }, + { + "epoch": 1.4793355744512557, + "grad_norm": 0.01525307446718216, + "learning_rate": 5.20913049214936e-05, + "loss": 0.021, + "step": 5610 + }, + { + "epoch": 1.4806538791114627, + "grad_norm": 0.0024215306621044874, + "learning_rate": 5.1959361393323655e-05, + "loss": 0.0017, + "step": 5615 + }, + { + "epoch": 1.4819721837716697, + "grad_norm": 1.4708061218261719, + "learning_rate": 5.182741786515372e-05, + "loss": 0.04, + "step": 5620 + }, + { + "epoch": 1.4832904884318765, + "grad_norm": 0.015033531002700329, + "learning_rate": 5.169547433698377e-05, + "loss": 0.0042, + "step": 5625 + }, + { + "epoch": 1.4846087930920837, + "grad_norm": 0.0035444959066808224, + 
"learning_rate": 5.156353080881383e-05, + "loss": 0.0087, + "step": 5630 + }, + { + "epoch": 1.4859270977522905, + "grad_norm": 0.010087919421494007, + "learning_rate": 5.143158728064389e-05, + "loss": 0.0158, + "step": 5635 + }, + { + "epoch": 1.4872454024124975, + "grad_norm": 0.05779251083731651, + "learning_rate": 5.129964375247395e-05, + "loss": 0.0157, + "step": 5640 + }, + { + "epoch": 1.4885637070727045, + "grad_norm": 0.14927980303764343, + "learning_rate": 5.1167700224304e-05, + "loss": 0.0257, + "step": 5645 + }, + { + "epoch": 1.4898820117329115, + "grad_norm": 0.004252352751791477, + "learning_rate": 5.103575669613405e-05, + "loss": 0.0198, + "step": 5650 + }, + { + "epoch": 1.4912003163931185, + "grad_norm": 0.0029206848703324795, + "learning_rate": 5.090381316796411e-05, + "loss": 0.0016, + "step": 5655 + }, + { + "epoch": 1.4925186210533254, + "grad_norm": 0.005047530401498079, + "learning_rate": 5.077186963979417e-05, + "loss": 0.0023, + "step": 5660 + }, + { + "epoch": 1.4938369257135324, + "grad_norm": 0.003732564626261592, + "learning_rate": 5.0639926111624225e-05, + "loss": 0.0336, + "step": 5665 + }, + { + "epoch": 1.4951552303737394, + "grad_norm": 0.3832889497280121, + "learning_rate": 5.050798258345428e-05, + "loss": 0.0476, + "step": 5670 + }, + { + "epoch": 1.4964735350339464, + "grad_norm": 0.06733009219169617, + "learning_rate": 5.037603905528434e-05, + "loss": 0.0044, + "step": 5675 + }, + { + "epoch": 1.4977918396941532, + "grad_norm": 0.008067069575190544, + "learning_rate": 5.02440955271144e-05, + "loss": 0.0035, + "step": 5680 + }, + { + "epoch": 1.4991101443543604, + "grad_norm": 0.01706300489604473, + "learning_rate": 5.0112151998944454e-05, + "loss": 0.0031, + "step": 5685 + }, + { + "epoch": 1.5004284490145672, + "grad_norm": 0.009932024404406548, + "learning_rate": 4.998020847077451e-05, + "loss": 0.0587, + "step": 5690 + }, + { + "epoch": 1.5017467536747744, + "grad_norm": 0.006488936021924019, + "learning_rate": 4.9848264942604566e-05, + "loss": 0.002, + "step": 5695 + }, + { + "epoch": 1.5030650583349812, + "grad_norm": 0.17488756775856018, + "learning_rate": 4.971632141443462e-05, + "loss": 0.0245, + "step": 5700 + }, + { + "epoch": 1.5043833629951882, + "grad_norm": 0.3327178359031677, + "learning_rate": 4.9584377886264684e-05, + "loss": 0.0404, + "step": 5705 + }, + { + "epoch": 1.5057016676553951, + "grad_norm": 0.18467263877391815, + "learning_rate": 4.945243435809474e-05, + "loss": 0.0248, + "step": 5710 + }, + { + "epoch": 1.5070199723156021, + "grad_norm": 0.020061776041984558, + "learning_rate": 4.9320490829924795e-05, + "loss": 0.0034, + "step": 5715 + }, + { + "epoch": 1.5083382769758091, + "grad_norm": 0.0005288647953420877, + "learning_rate": 4.918854730175485e-05, + "loss": 0.0076, + "step": 5720 + }, + { + "epoch": 1.5096565816360161, + "grad_norm": 0.007515576668083668, + "learning_rate": 4.9056603773584906e-05, + "loss": 0.004, + "step": 5725 + }, + { + "epoch": 1.5109748862962231, + "grad_norm": 0.05365758761763573, + "learning_rate": 4.892466024541497e-05, + "loss": 0.0222, + "step": 5730 + }, + { + "epoch": 1.51229319095643, + "grad_norm": 0.00572391040623188, + "learning_rate": 4.8792716717245025e-05, + "loss": 0.0132, + "step": 5735 + }, + { + "epoch": 1.513611495616637, + "grad_norm": 0.21178627014160156, + "learning_rate": 4.8660773189075073e-05, + "loss": 0.0417, + "step": 5740 + }, + { + "epoch": 1.5149298002768439, + "grad_norm": 0.0641486868262291, + "learning_rate": 4.8528829660905136e-05, + "loss": 0.011, + "step": 
5745 + }, + { + "epoch": 1.516248104937051, + "grad_norm": 0.04451924189925194, + "learning_rate": 4.839688613273519e-05, + "loss": 0.012, + "step": 5750 + }, + { + "epoch": 1.5175664095972579, + "grad_norm": 0.019951259717345238, + "learning_rate": 4.826494260456525e-05, + "loss": 0.009, + "step": 5755 + }, + { + "epoch": 1.5188847142574649, + "grad_norm": 0.021919893100857735, + "learning_rate": 4.813299907639531e-05, + "loss": 0.0081, + "step": 5760 + }, + { + "epoch": 1.5202030189176718, + "grad_norm": 0.5730367302894592, + "learning_rate": 4.800105554822536e-05, + "loss": 0.0254, + "step": 5765 + }, + { + "epoch": 1.5215213235778788, + "grad_norm": 0.02501523122191429, + "learning_rate": 4.786911202005542e-05, + "loss": 0.0045, + "step": 5770 + }, + { + "epoch": 1.5228396282380858, + "grad_norm": 0.01574208028614521, + "learning_rate": 4.773716849188548e-05, + "loss": 0.0081, + "step": 5775 + }, + { + "epoch": 1.5241579328982928, + "grad_norm": 0.009626791812479496, + "learning_rate": 4.760522496371553e-05, + "loss": 0.0037, + "step": 5780 + }, + { + "epoch": 1.5254762375584998, + "grad_norm": 0.535539448261261, + "learning_rate": 4.747328143554559e-05, + "loss": 0.0149, + "step": 5785 + }, + { + "epoch": 1.5267945422187066, + "grad_norm": 0.004934845492243767, + "learning_rate": 4.7341337907375644e-05, + "loss": 0.0048, + "step": 5790 + }, + { + "epoch": 1.5281128468789138, + "grad_norm": 0.009070080704987049, + "learning_rate": 4.72093943792057e-05, + "loss": 0.0028, + "step": 5795 + }, + { + "epoch": 1.5294311515391206, + "grad_norm": 0.0040720063261687756, + "learning_rate": 4.707745085103576e-05, + "loss": 0.0016, + "step": 5800 + }, + { + "epoch": 1.5307494561993278, + "grad_norm": 0.45212000608444214, + "learning_rate": 4.694550732286582e-05, + "loss": 0.0111, + "step": 5805 + }, + { + "epoch": 1.5320677608595346, + "grad_norm": 0.024048497900366783, + "learning_rate": 4.681356379469587e-05, + "loss": 0.0149, + "step": 5810 + }, + { + "epoch": 1.5333860655197418, + "grad_norm": 0.11899136006832123, + "learning_rate": 4.668162026652593e-05, + "loss": 0.0034, + "step": 5815 + }, + { + "epoch": 1.5347043701799485, + "grad_norm": 0.011249657720327377, + "learning_rate": 4.6549676738355984e-05, + "loss": 0.0052, + "step": 5820 + }, + { + "epoch": 1.5360226748401555, + "grad_norm": 0.051634710282087326, + "learning_rate": 4.641773321018604e-05, + "loss": 0.0031, + "step": 5825 + }, + { + "epoch": 1.5373409795003625, + "grad_norm": 0.3726826012134552, + "learning_rate": 4.62857896820161e-05, + "loss": 0.0582, + "step": 5830 + }, + { + "epoch": 1.5386592841605695, + "grad_norm": 0.5827310681343079, + "learning_rate": 4.615384615384616e-05, + "loss": 0.0652, + "step": 5835 + }, + { + "epoch": 1.5399775888207765, + "grad_norm": 0.006390869617462158, + "learning_rate": 4.6021902625676214e-05, + "loss": 0.0022, + "step": 5840 + }, + { + "epoch": 1.5412958934809835, + "grad_norm": 0.022760871797800064, + "learning_rate": 4.588995909750627e-05, + "loss": 0.0311, + "step": 5845 + }, + { + "epoch": 1.5426141981411905, + "grad_norm": 0.22773241996765137, + "learning_rate": 4.5758015569336325e-05, + "loss": 0.0051, + "step": 5850 + }, + { + "epoch": 1.5439325028013973, + "grad_norm": 0.015375247225165367, + "learning_rate": 4.562607204116639e-05, + "loss": 0.0023, + "step": 5855 + }, + { + "epoch": 1.5452508074616045, + "grad_norm": 0.007347101345658302, + "learning_rate": 4.549412851299644e-05, + "loss": 0.0437, + "step": 5860 + }, + { + "epoch": 1.5465691121218113, + "grad_norm": 
0.012344900518655777, + "learning_rate": 4.536218498482649e-05, + "loss": 0.004, + "step": 5865 + }, + { + "epoch": 1.5478874167820185, + "grad_norm": 0.27038896083831787, + "learning_rate": 4.5230241456656555e-05, + "loss": 0.0047, + "step": 5870 + }, + { + "epoch": 1.5492057214422252, + "grad_norm": 0.016395213082432747, + "learning_rate": 4.509829792848661e-05, + "loss": 0.0026, + "step": 5875 + }, + { + "epoch": 1.5505240261024322, + "grad_norm": 0.4217267632484436, + "learning_rate": 4.4966354400316666e-05, + "loss": 0.0364, + "step": 5880 + }, + { + "epoch": 1.5518423307626392, + "grad_norm": 0.20046105980873108, + "learning_rate": 4.483441087214673e-05, + "loss": 0.0243, + "step": 5885 + }, + { + "epoch": 1.5531606354228462, + "grad_norm": 0.004307698458433151, + "learning_rate": 4.470246734397678e-05, + "loss": 0.0064, + "step": 5890 + }, + { + "epoch": 1.5544789400830532, + "grad_norm": 0.46102187037467957, + "learning_rate": 4.457052381580683e-05, + "loss": 0.0115, + "step": 5895 + }, + { + "epoch": 1.5557972447432602, + "grad_norm": 0.0689118504524231, + "learning_rate": 4.4438580287636895e-05, + "loss": 0.0334, + "step": 5900 + }, + { + "epoch": 1.5571155494034672, + "grad_norm": 0.003091114340350032, + "learning_rate": 4.430663675946695e-05, + "loss": 0.0246, + "step": 5905 + }, + { + "epoch": 1.558433854063674, + "grad_norm": 0.003877349430695176, + "learning_rate": 4.417469323129701e-05, + "loss": 0.0032, + "step": 5910 + }, + { + "epoch": 1.5597521587238812, + "grad_norm": 0.30713143944740295, + "learning_rate": 4.404274970312706e-05, + "loss": 0.0229, + "step": 5915 + }, + { + "epoch": 1.561070463384088, + "grad_norm": 0.07344445586204529, + "learning_rate": 4.391080617495712e-05, + "loss": 0.0078, + "step": 5920 + }, + { + "epoch": 1.5623887680442952, + "grad_norm": 0.01774723082780838, + "learning_rate": 4.377886264678718e-05, + "loss": 0.0034, + "step": 5925 + }, + { + "epoch": 1.563707072704502, + "grad_norm": 0.476324200630188, + "learning_rate": 4.3646919118617236e-05, + "loss": 0.0071, + "step": 5930 + }, + { + "epoch": 1.5650253773647091, + "grad_norm": 0.11624465882778168, + "learning_rate": 4.351497559044729e-05, + "loss": 0.0236, + "step": 5935 + }, + { + "epoch": 1.566343682024916, + "grad_norm": 0.190691277384758, + "learning_rate": 4.338303206227735e-05, + "loss": 0.006, + "step": 5940 + }, + { + "epoch": 1.567661986685123, + "grad_norm": 0.20517045259475708, + "learning_rate": 4.32510885341074e-05, + "loss": 0.009, + "step": 5945 + }, + { + "epoch": 1.56898029134533, + "grad_norm": 0.008122317492961884, + "learning_rate": 4.311914500593746e-05, + "loss": 0.0041, + "step": 5950 + }, + { + "epoch": 1.570298596005537, + "grad_norm": 0.01982291042804718, + "learning_rate": 4.298720147776752e-05, + "loss": 0.0258, + "step": 5955 + }, + { + "epoch": 1.5716169006657439, + "grad_norm": 0.000996922142803669, + "learning_rate": 4.285525794959758e-05, + "loss": 0.0233, + "step": 5960 + }, + { + "epoch": 1.5729352053259509, + "grad_norm": 0.09725592285394669, + "learning_rate": 4.272331442142763e-05, + "loss": 0.0218, + "step": 5965 + }, + { + "epoch": 1.5742535099861579, + "grad_norm": 0.0672350749373436, + "learning_rate": 4.259137089325769e-05, + "loss": 0.0194, + "step": 5970 + }, + { + "epoch": 1.5755718146463646, + "grad_norm": 0.014844833873212337, + "learning_rate": 4.2459427365087744e-05, + "loss": 0.0298, + "step": 5975 + }, + { + "epoch": 1.5768901193065719, + "grad_norm": 0.030519040301442146, + "learning_rate": 4.2327483836917806e-05, + "loss": 0.0178, + 
"step": 5980 + }, + { + "epoch": 1.5782084239667786, + "grad_norm": 0.018561460077762604, + "learning_rate": 4.219554030874786e-05, + "loss": 0.0154, + "step": 5985 + }, + { + "epoch": 1.5795267286269858, + "grad_norm": 0.02470085583627224, + "learning_rate": 4.206359678057791e-05, + "loss": 0.0361, + "step": 5990 + }, + { + "epoch": 1.5808450332871926, + "grad_norm": 0.055412422865629196, + "learning_rate": 4.193165325240797e-05, + "loss": 0.0162, + "step": 5995 + }, + { + "epoch": 1.5821633379473996, + "grad_norm": 0.0034158769994974136, + "learning_rate": 4.179970972423803e-05, + "loss": 0.0068, + "step": 6000 + }, + { + "epoch": 1.5821633379473996, + "eval_loss": 0.024797894060611725, + "eval_runtime": 452.1611, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 6000 + }, + { + "epoch": 1.5834816426076066, + "grad_norm": 0.01284120511263609, + "learning_rate": 4.1667766196068085e-05, + "loss": 0.0036, + "step": 6005 + }, + { + "epoch": 1.5847999472678136, + "grad_norm": 0.01274865586310625, + "learning_rate": 4.153582266789815e-05, + "loss": 0.0447, + "step": 6010 + }, + { + "epoch": 1.5861182519280206, + "grad_norm": 0.03555435314774513, + "learning_rate": 4.1403879139728196e-05, + "loss": 0.0078, + "step": 6015 + }, + { + "epoch": 1.5874365565882276, + "grad_norm": 0.0011938117677345872, + "learning_rate": 4.127193561155825e-05, + "loss": 0.0136, + "step": 6020 + }, + { + "epoch": 1.5887548612484346, + "grad_norm": 0.9741255640983582, + "learning_rate": 4.1139992083388314e-05, + "loss": 0.0153, + "step": 6025 + }, + { + "epoch": 1.5900731659086413, + "grad_norm": 0.011220674030482769, + "learning_rate": 4.100804855521837e-05, + "loss": 0.0262, + "step": 6030 + }, + { + "epoch": 1.5913914705688486, + "grad_norm": 0.021556466817855835, + "learning_rate": 4.0876105027048425e-05, + "loss": 0.0044, + "step": 6035 + }, + { + "epoch": 1.5927097752290553, + "grad_norm": 0.2725502848625183, + "learning_rate": 4.074416149887848e-05, + "loss": 0.0558, + "step": 6040 + }, + { + "epoch": 1.5940280798892625, + "grad_norm": 0.6407182216644287, + "learning_rate": 4.0612217970708537e-05, + "loss": 0.0261, + "step": 6045 + }, + { + "epoch": 1.5953463845494693, + "grad_norm": 0.0024960115551948547, + "learning_rate": 4.04802744425386e-05, + "loss": 0.0128, + "step": 6050 + }, + { + "epoch": 1.5966646892096763, + "grad_norm": 0.11380109190940857, + "learning_rate": 4.0348330914368655e-05, + "loss": 0.0199, + "step": 6055 + }, + { + "epoch": 1.5979829938698833, + "grad_norm": 0.18358005583286285, + "learning_rate": 4.0216387386198704e-05, + "loss": 0.0083, + "step": 6060 + }, + { + "epoch": 1.5993012985300903, + "grad_norm": 0.06412303447723389, + "learning_rate": 4.0084443858028766e-05, + "loss": 0.0548, + "step": 6065 + }, + { + "epoch": 1.6006196031902973, + "grad_norm": 0.6999421119689941, + "learning_rate": 3.995250032985882e-05, + "loss": 0.0074, + "step": 6070 + }, + { + "epoch": 1.6019379078505043, + "grad_norm": 0.18698133528232574, + "learning_rate": 3.982055680168888e-05, + "loss": 0.0542, + "step": 6075 + }, + { + "epoch": 1.6032562125107113, + "grad_norm": 0.014717207290232182, + "learning_rate": 3.968861327351894e-05, + "loss": 0.0071, + "step": 6080 + }, + { + "epoch": 1.604574517170918, + "grad_norm": 0.0765385851264, + "learning_rate": 3.955666974534899e-05, + "loss": 0.0063, + "step": 6085 + }, + { + "epoch": 1.6058928218311253, + "grad_norm": 0.4332450330257416, + "learning_rate": 3.9424726217179044e-05, + "loss": 0.0071, + "step": 6090 + }, + { + "epoch": 
1.607211126491332, + "grad_norm": 0.003700035158544779, + "learning_rate": 3.929278268900911e-05, + "loss": 0.0052, + "step": 6095 + }, + { + "epoch": 1.6085294311515392, + "grad_norm": 0.02500278130173683, + "learning_rate": 3.916083916083916e-05, + "loss": 0.0387, + "step": 6100 + }, + { + "epoch": 1.609847735811746, + "grad_norm": 0.023568281903862953, + "learning_rate": 3.902889563266922e-05, + "loss": 0.0594, + "step": 6105 + }, + { + "epoch": 1.6111660404719532, + "grad_norm": 0.02687825821340084, + "learning_rate": 3.8896952104499274e-05, + "loss": 0.0229, + "step": 6110 + }, + { + "epoch": 1.61248434513216, + "grad_norm": 0.005178579594939947, + "learning_rate": 3.876500857632933e-05, + "loss": 0.0293, + "step": 6115 + }, + { + "epoch": 1.613802649792367, + "grad_norm": 0.3987988531589508, + "learning_rate": 3.863306504815939e-05, + "loss": 0.015, + "step": 6120 + }, + { + "epoch": 1.615120954452574, + "grad_norm": 0.18915466964244843, + "learning_rate": 3.850112151998945e-05, + "loss": 0.023, + "step": 6125 + }, + { + "epoch": 1.616439259112781, + "grad_norm": 0.015252528712153435, + "learning_rate": 3.83691779918195e-05, + "loss": 0.0185, + "step": 6130 + }, + { + "epoch": 1.617757563772988, + "grad_norm": 0.04947187379002571, + "learning_rate": 3.823723446364956e-05, + "loss": 0.0131, + "step": 6135 + }, + { + "epoch": 1.619075868433195, + "grad_norm": 0.017095958814024925, + "learning_rate": 3.8105290935479615e-05, + "loss": 0.0071, + "step": 6140 + }, + { + "epoch": 1.620394173093402, + "grad_norm": 0.013050337322056293, + "learning_rate": 3.797334740730967e-05, + "loss": 0.0038, + "step": 6145 + }, + { + "epoch": 1.6217124777536087, + "grad_norm": 0.08132806420326233, + "learning_rate": 3.784140387913973e-05, + "loss": 0.0043, + "step": 6150 + }, + { + "epoch": 1.623030782413816, + "grad_norm": 0.020741304382681847, + "learning_rate": 3.770946035096979e-05, + "loss": 0.006, + "step": 6155 + }, + { + "epoch": 1.6243490870740227, + "grad_norm": 0.0576217919588089, + "learning_rate": 3.7577516822799844e-05, + "loss": 0.0033, + "step": 6160 + }, + { + "epoch": 1.62566739173423, + "grad_norm": 0.03032900020480156, + "learning_rate": 3.74455732946299e-05, + "loss": 0.0318, + "step": 6165 + }, + { + "epoch": 1.6269856963944367, + "grad_norm": 0.8868799209594727, + "learning_rate": 3.7313629766459955e-05, + "loss": 0.0304, + "step": 6170 + }, + { + "epoch": 1.6283040010546437, + "grad_norm": 0.003816834883764386, + "learning_rate": 3.718168623829002e-05, + "loss": 0.003, + "step": 6175 + }, + { + "epoch": 1.6296223057148507, + "grad_norm": 0.05368296429514885, + "learning_rate": 3.704974271012007e-05, + "loss": 0.0064, + "step": 6180 + }, + { + "epoch": 1.6309406103750577, + "grad_norm": 0.09963366389274597, + "learning_rate": 3.691779918195012e-05, + "loss": 0.0097, + "step": 6185 + }, + { + "epoch": 1.6322589150352647, + "grad_norm": 0.006273225415498018, + "learning_rate": 3.6785855653780185e-05, + "loss": 0.0071, + "step": 6190 + }, + { + "epoch": 1.6335772196954716, + "grad_norm": 0.15079188346862793, + "learning_rate": 3.665391212561024e-05, + "loss": 0.0058, + "step": 6195 + }, + { + "epoch": 1.6348955243556786, + "grad_norm": 0.004980973433703184, + "learning_rate": 3.6521968597440296e-05, + "loss": 0.0051, + "step": 6200 + }, + { + "epoch": 1.6362138290158854, + "grad_norm": 0.004235363099724054, + "learning_rate": 3.639002506927036e-05, + "loss": 0.0028, + "step": 6205 + }, + { + "epoch": 1.6375321336760926, + "grad_norm": 0.003829963505268097, + "learning_rate": 
3.625808154110041e-05, + "loss": 0.0347, + "step": 6210 + }, + { + "epoch": 1.6388504383362994, + "grad_norm": 0.021650686860084534, + "learning_rate": 3.612613801293046e-05, + "loss": 0.0036, + "step": 6215 + }, + { + "epoch": 1.6401687429965066, + "grad_norm": 0.06326934695243835, + "learning_rate": 3.5994194484760525e-05, + "loss": 0.0228, + "step": 6220 + }, + { + "epoch": 1.6414870476567134, + "grad_norm": 0.017276322469115257, + "learning_rate": 3.586225095659058e-05, + "loss": 0.0025, + "step": 6225 + }, + { + "epoch": 1.6428053523169206, + "grad_norm": 0.005066063720732927, + "learning_rate": 3.573030742842064e-05, + "loss": 0.0047, + "step": 6230 + }, + { + "epoch": 1.6441236569771274, + "grad_norm": 0.003512267954647541, + "learning_rate": 3.559836390025069e-05, + "loss": 0.0018, + "step": 6235 + }, + { + "epoch": 1.6454419616373344, + "grad_norm": 0.004347699694335461, + "learning_rate": 3.546642037208075e-05, + "loss": 0.0045, + "step": 6240 + }, + { + "epoch": 1.6467602662975414, + "grad_norm": 0.008277533575892448, + "learning_rate": 3.533447684391081e-05, + "loss": 0.0456, + "step": 6245 + }, + { + "epoch": 1.6480785709577483, + "grad_norm": 0.00973033718764782, + "learning_rate": 3.5202533315740866e-05, + "loss": 0.0215, + "step": 6250 + }, + { + "epoch": 1.6493968756179553, + "grad_norm": 1.9432978630065918, + "learning_rate": 3.507058978757092e-05, + "loss": 0.0132, + "step": 6255 + }, + { + "epoch": 1.6507151802781623, + "grad_norm": 0.2693535387516022, + "learning_rate": 3.493864625940098e-05, + "loss": 0.0037, + "step": 6260 + }, + { + "epoch": 1.6520334849383693, + "grad_norm": 0.02107766456902027, + "learning_rate": 3.480670273123103e-05, + "loss": 0.0031, + "step": 6265 + }, + { + "epoch": 1.653351789598576, + "grad_norm": 0.07168436795473099, + "learning_rate": 3.467475920306109e-05, + "loss": 0.0101, + "step": 6270 + }, + { + "epoch": 1.6546700942587833, + "grad_norm": 0.06479799002408981, + "learning_rate": 3.454281567489115e-05, + "loss": 0.0032, + "step": 6275 + }, + { + "epoch": 1.65598839891899, + "grad_norm": 0.0013557536294683814, + "learning_rate": 3.441087214672121e-05, + "loss": 0.0037, + "step": 6280 + }, + { + "epoch": 1.6573067035791973, + "grad_norm": 0.07330150157213211, + "learning_rate": 3.427892861855126e-05, + "loss": 0.0031, + "step": 6285 + }, + { + "epoch": 1.658625008239404, + "grad_norm": 0.08246012777090073, + "learning_rate": 3.414698509038132e-05, + "loss": 0.0028, + "step": 6290 + }, + { + "epoch": 1.659943312899611, + "grad_norm": 0.6232367157936096, + "learning_rate": 3.4015041562211374e-05, + "loss": 0.0042, + "step": 6295 + }, + { + "epoch": 1.661261617559818, + "grad_norm": 0.007676729932427406, + "learning_rate": 3.388309803404143e-05, + "loss": 0.0501, + "step": 6300 + }, + { + "epoch": 1.662579922220025, + "grad_norm": 0.02081216312944889, + "learning_rate": 3.375115450587149e-05, + "loss": 0.0047, + "step": 6305 + }, + { + "epoch": 1.663898226880232, + "grad_norm": 0.008829087018966675, + "learning_rate": 3.361921097770154e-05, + "loss": 0.0298, + "step": 6310 + }, + { + "epoch": 1.665216531540439, + "grad_norm": 0.4426127076148987, + "learning_rate": 3.34872674495316e-05, + "loss": 0.0045, + "step": 6315 + }, + { + "epoch": 1.666534836200646, + "grad_norm": 0.025818035006523132, + "learning_rate": 3.335532392136166e-05, + "loss": 0.0028, + "step": 6320 + }, + { + "epoch": 1.6678531408608528, + "grad_norm": 0.6068133115768433, + "learning_rate": 3.3223380393191715e-05, + "loss": 0.0202, + "step": 6325 + }, + { + "epoch": 
1.66917144552106, + "grad_norm": 0.02740122564136982, + "learning_rate": 3.309143686502178e-05, + "loss": 0.0025, + "step": 6330 + }, + { + "epoch": 1.6704897501812668, + "grad_norm": 0.15878735482692719, + "learning_rate": 3.2959493336851826e-05, + "loss": 0.004, + "step": 6335 + }, + { + "epoch": 1.671808054841474, + "grad_norm": 0.006827466655522585, + "learning_rate": 3.282754980868188e-05, + "loss": 0.0048, + "step": 6340 + }, + { + "epoch": 1.6731263595016808, + "grad_norm": 0.19508551061153412, + "learning_rate": 3.2695606280511944e-05, + "loss": 0.0025, + "step": 6345 + }, + { + "epoch": 1.674444664161888, + "grad_norm": 0.8176754713058472, + "learning_rate": 3.2563662752342e-05, + "loss": 0.0151, + "step": 6350 + }, + { + "epoch": 1.6757629688220947, + "grad_norm": 0.011672024615108967, + "learning_rate": 3.2431719224172055e-05, + "loss": 0.0452, + "step": 6355 + }, + { + "epoch": 1.6770812734823017, + "grad_norm": 0.015824951231479645, + "learning_rate": 3.229977569600211e-05, + "loss": 0.0236, + "step": 6360 + }, + { + "epoch": 1.6783995781425087, + "grad_norm": 0.1358737051486969, + "learning_rate": 3.216783216783217e-05, + "loss": 0.0078, + "step": 6365 + }, + { + "epoch": 1.6797178828027157, + "grad_norm": 0.004896901547908783, + "learning_rate": 3.203588863966223e-05, + "loss": 0.0042, + "step": 6370 + }, + { + "epoch": 1.6810361874629227, + "grad_norm": 0.22593103349208832, + "learning_rate": 3.1903945111492285e-05, + "loss": 0.0053, + "step": 6375 + }, + { + "epoch": 1.6823544921231297, + "grad_norm": 0.0073196059092879295, + "learning_rate": 3.177200158332234e-05, + "loss": 0.0287, + "step": 6380 + }, + { + "epoch": 1.6836727967833367, + "grad_norm": 0.018524926155805588, + "learning_rate": 3.1640058055152396e-05, + "loss": 0.0122, + "step": 6385 + }, + { + "epoch": 1.6849911014435435, + "grad_norm": 0.7453815937042236, + "learning_rate": 3.150811452698245e-05, + "loss": 0.0378, + "step": 6390 + }, + { + "epoch": 1.6863094061037507, + "grad_norm": 0.22409795224666595, + "learning_rate": 3.137617099881251e-05, + "loss": 0.0282, + "step": 6395 + }, + { + "epoch": 1.6876277107639575, + "grad_norm": 0.005432693753391504, + "learning_rate": 3.124422747064257e-05, + "loss": 0.0162, + "step": 6400 + }, + { + "epoch": 1.6889460154241647, + "grad_norm": 0.1493055820465088, + "learning_rate": 3.1112283942472626e-05, + "loss": 0.0123, + "step": 6405 + }, + { + "epoch": 1.6902643200843714, + "grad_norm": 0.1638440042734146, + "learning_rate": 3.0980340414302674e-05, + "loss": 0.0058, + "step": 6410 + }, + { + "epoch": 1.6915826247445784, + "grad_norm": 0.015779908746480942, + "learning_rate": 3.084839688613274e-05, + "loss": 0.0157, + "step": 6415 + }, + { + "epoch": 1.6929009294047854, + "grad_norm": 0.0012348912423476577, + "learning_rate": 3.071645335796279e-05, + "loss": 0.0016, + "step": 6420 + }, + { + "epoch": 1.6942192340649924, + "grad_norm": 0.05294624716043472, + "learning_rate": 3.058450982979285e-05, + "loss": 0.0037, + "step": 6425 + }, + { + "epoch": 1.6955375387251994, + "grad_norm": 0.01926981844007969, + "learning_rate": 3.045256630162291e-05, + "loss": 0.0053, + "step": 6430 + }, + { + "epoch": 1.6968558433854064, + "grad_norm": 0.005958891473710537, + "learning_rate": 3.0320622773452963e-05, + "loss": 0.0025, + "step": 6435 + }, + { + "epoch": 1.6981741480456134, + "grad_norm": 0.001902201445773244, + "learning_rate": 3.018867924528302e-05, + "loss": 0.0027, + "step": 6440 + }, + { + "epoch": 1.6994924527058202, + "grad_norm": 0.036614127457141876, + 
"learning_rate": 3.0056735717113078e-05, + "loss": 0.0026, + "step": 6445 + }, + { + "epoch": 1.7008107573660274, + "grad_norm": 0.07294526696205139, + "learning_rate": 2.9924792188943133e-05, + "loss": 0.0042, + "step": 6450 + }, + { + "epoch": 1.7021290620262342, + "grad_norm": 0.42822372913360596, + "learning_rate": 2.9792848660773192e-05, + "loss": 0.013, + "step": 6455 + }, + { + "epoch": 1.7034473666864414, + "grad_norm": 0.036622967571020126, + "learning_rate": 2.9660905132603245e-05, + "loss": 0.0029, + "step": 6460 + }, + { + "epoch": 1.7047656713466481, + "grad_norm": 0.08314034342765808, + "learning_rate": 2.9528961604433304e-05, + "loss": 0.0043, + "step": 6465 + }, + { + "epoch": 1.7060839760068551, + "grad_norm": 0.0005654952838085592, + "learning_rate": 2.939701807626336e-05, + "loss": 0.0595, + "step": 6470 + }, + { + "epoch": 1.7074022806670621, + "grad_norm": 0.004545385017991066, + "learning_rate": 2.926507454809342e-05, + "loss": 0.0044, + "step": 6475 + }, + { + "epoch": 1.7087205853272691, + "grad_norm": 0.00033831383916549385, + "learning_rate": 2.9133131019923477e-05, + "loss": 0.0046, + "step": 6480 + }, + { + "epoch": 1.710038889987476, + "grad_norm": 0.0019903562497347593, + "learning_rate": 2.900118749175353e-05, + "loss": 0.0026, + "step": 6485 + }, + { + "epoch": 1.711357194647683, + "grad_norm": 0.10188104957342148, + "learning_rate": 2.8869243963583585e-05, + "loss": 0.0069, + "step": 6490 + }, + { + "epoch": 1.71267549930789, + "grad_norm": 0.2123432606458664, + "learning_rate": 2.8737300435413644e-05, + "loss": 0.0199, + "step": 6495 + }, + { + "epoch": 1.7139938039680969, + "grad_norm": 0.43209517002105713, + "learning_rate": 2.8605356907243703e-05, + "loss": 0.0099, + "step": 6500 + }, + { + "epoch": 1.7139938039680969, + "eval_loss": 0.024327505379915237, + "eval_runtime": 452.0052, + "eval_samples_per_second": 7.46, + "eval_steps_per_second": 3.73, + "step": 6500 + }, + { + "epoch": 1.715312108628304, + "grad_norm": 0.009868285618722439, + "learning_rate": 2.847341337907376e-05, + "loss": 0.0025, + "step": 6505 + }, + { + "epoch": 1.7166304132885108, + "grad_norm": 0.00778606254607439, + "learning_rate": 2.834146985090381e-05, + "loss": 0.0028, + "step": 6510 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.02987460047006607, + "learning_rate": 2.820952632273387e-05, + "loss": 0.0068, + "step": 6515 + }, + { + "epoch": 1.7192670226089248, + "grad_norm": 0.04475142061710358, + "learning_rate": 2.807758279456393e-05, + "loss": 0.0022, + "step": 6520 + }, + { + "epoch": 1.720585327269132, + "grad_norm": 0.12720516324043274, + "learning_rate": 2.7945639266393985e-05, + "loss": 0.0488, + "step": 6525 + }, + { + "epoch": 1.7219036319293388, + "grad_norm": 0.0011463731061667204, + "learning_rate": 2.7813695738224044e-05, + "loss": 0.0023, + "step": 6530 + }, + { + "epoch": 1.7232219365895458, + "grad_norm": 0.008907752111554146, + "learning_rate": 2.7681752210054096e-05, + "loss": 0.0039, + "step": 6535 + }, + { + "epoch": 1.7245402412497528, + "grad_norm": 0.008416680619120598, + "learning_rate": 2.7549808681884156e-05, + "loss": 0.0055, + "step": 6540 + }, + { + "epoch": 1.7258585459099598, + "grad_norm": 0.26278871297836304, + "learning_rate": 2.741786515371421e-05, + "loss": 0.0386, + "step": 6545 + }, + { + "epoch": 1.7271768505701668, + "grad_norm": 0.01750275492668152, + "learning_rate": 2.728592162554427e-05, + "loss": 0.0048, + "step": 6550 + }, + { + "epoch": 1.7284951552303738, + "grad_norm": 0.009483959525823593, + "learning_rate": 
2.7153978097374326e-05, + "loss": 0.0061, + "step": 6555 + }, + { + "epoch": 1.7298134598905808, + "grad_norm": 0.016591722145676613, + "learning_rate": 2.7022034569204378e-05, + "loss": 0.0058, + "step": 6560 + }, + { + "epoch": 1.7311317645507875, + "grad_norm": 0.5120682716369629, + "learning_rate": 2.6890091041034437e-05, + "loss": 0.0229, + "step": 6565 + }, + { + "epoch": 1.7324500692109948, + "grad_norm": 0.03748248517513275, + "learning_rate": 2.6758147512864496e-05, + "loss": 0.0026, + "step": 6570 + }, + { + "epoch": 1.7337683738712015, + "grad_norm": 0.08328749984502792, + "learning_rate": 2.6626203984694552e-05, + "loss": 0.0052, + "step": 6575 + }, + { + "epoch": 1.7350866785314087, + "grad_norm": 0.012284482829272747, + "learning_rate": 2.649426045652461e-05, + "loss": 0.0353, + "step": 6580 + }, + { + "epoch": 1.7364049831916155, + "grad_norm": 0.06362583488225937, + "learning_rate": 2.6362316928354663e-05, + "loss": 0.0309, + "step": 6585 + }, + { + "epoch": 1.7377232878518225, + "grad_norm": 0.01475360058248043, + "learning_rate": 2.6230373400184722e-05, + "loss": 0.0034, + "step": 6590 + }, + { + "epoch": 1.7390415925120295, + "grad_norm": 0.002241638721898198, + "learning_rate": 2.6098429872014778e-05, + "loss": 0.0365, + "step": 6595 + }, + { + "epoch": 1.7403598971722365, + "grad_norm": 0.11375941336154938, + "learning_rate": 2.5966486343844837e-05, + "loss": 0.0241, + "step": 6600 + }, + { + "epoch": 1.7416782018324435, + "grad_norm": 0.009631779976189137, + "learning_rate": 2.5834542815674896e-05, + "loss": 0.0026, + "step": 6605 + }, + { + "epoch": 1.7429965064926505, + "grad_norm": 0.12113262712955475, + "learning_rate": 2.570259928750495e-05, + "loss": 0.0207, + "step": 6610 + }, + { + "epoch": 1.7443148111528575, + "grad_norm": 0.006536155007779598, + "learning_rate": 2.5570655759335004e-05, + "loss": 0.0022, + "step": 6615 + }, + { + "epoch": 1.7456331158130642, + "grad_norm": 0.043030887842178345, + "learning_rate": 2.5438712231165063e-05, + "loss": 0.003, + "step": 6620 + }, + { + "epoch": 1.7469514204732715, + "grad_norm": 0.00860620103776455, + "learning_rate": 2.5306768702995122e-05, + "loss": 0.027, + "step": 6625 + }, + { + "epoch": 1.7482697251334782, + "grad_norm": 0.014589210972189903, + "learning_rate": 2.5174825174825178e-05, + "loss": 0.0224, + "step": 6630 + }, + { + "epoch": 1.7495880297936854, + "grad_norm": 0.01215316355228424, + "learning_rate": 2.504288164665523e-05, + "loss": 0.011, + "step": 6635 + }, + { + "epoch": 1.7509063344538922, + "grad_norm": 0.10951556265354156, + "learning_rate": 2.491093811848529e-05, + "loss": 0.0384, + "step": 6640 + }, + { + "epoch": 1.7522246391140994, + "grad_norm": 0.30859875679016113, + "learning_rate": 2.4778994590315345e-05, + "loss": 0.0031, + "step": 6645 + }, + { + "epoch": 1.7535429437743062, + "grad_norm": 0.025427229702472687, + "learning_rate": 2.4647051062145404e-05, + "loss": 0.0171, + "step": 6650 + }, + { + "epoch": 1.7548612484345132, + "grad_norm": 0.03334197774529457, + "learning_rate": 2.451510753397546e-05, + "loss": 0.0473, + "step": 6655 + }, + { + "epoch": 1.7561795530947202, + "grad_norm": 0.013445639982819557, + "learning_rate": 2.438316400580552e-05, + "loss": 0.0056, + "step": 6660 + }, + { + "epoch": 1.7574978577549272, + "grad_norm": 0.008306960575282574, + "learning_rate": 2.425122047763557e-05, + "loss": 0.0104, + "step": 6665 + }, + { + "epoch": 1.7588161624151342, + "grad_norm": 0.012615012936294079, + "learning_rate": 2.411927694946563e-05, + "loss": 0.0097, + "step": 6670 + 
}, + { + "epoch": 1.7601344670753412, + "grad_norm": 0.006827410310506821, + "learning_rate": 2.398733342129569e-05, + "loss": 0.0057, + "step": 6675 + }, + { + "epoch": 1.7614527717355482, + "grad_norm": 0.017035294324159622, + "learning_rate": 2.3855389893125745e-05, + "loss": 0.0035, + "step": 6680 + }, + { + "epoch": 1.762771076395755, + "grad_norm": 0.036102693527936935, + "learning_rate": 2.37234463649558e-05, + "loss": 0.0031, + "step": 6685 + }, + { + "epoch": 1.7640893810559621, + "grad_norm": 0.5004498958587646, + "learning_rate": 2.3591502836785856e-05, + "loss": 0.0217, + "step": 6690 + }, + { + "epoch": 1.765407685716169, + "grad_norm": 0.017726672813296318, + "learning_rate": 2.3459559308615915e-05, + "loss": 0.0112, + "step": 6695 + }, + { + "epoch": 1.7667259903763761, + "grad_norm": 0.00940331444144249, + "learning_rate": 2.332761578044597e-05, + "loss": 0.0107, + "step": 6700 + }, + { + "epoch": 1.768044295036583, + "grad_norm": 0.007495497819036245, + "learning_rate": 2.3195672252276026e-05, + "loss": 0.0032, + "step": 6705 + }, + { + "epoch": 1.7693625996967899, + "grad_norm": 0.6863199472427368, + "learning_rate": 2.3063728724106085e-05, + "loss": 0.034, + "step": 6710 + }, + { + "epoch": 1.7706809043569969, + "grad_norm": 0.004587489180266857, + "learning_rate": 2.293178519593614e-05, + "loss": 0.0032, + "step": 6715 + }, + { + "epoch": 1.7719992090172039, + "grad_norm": 0.017706016078591347, + "learning_rate": 2.2799841667766197e-05, + "loss": 0.0036, + "step": 6720 + }, + { + "epoch": 1.7733175136774109, + "grad_norm": 0.012740216217935085, + "learning_rate": 2.2667898139596252e-05, + "loss": 0.0147, + "step": 6725 + }, + { + "epoch": 1.7746358183376179, + "grad_norm": 0.010391579940915108, + "learning_rate": 2.253595461142631e-05, + "loss": 0.0041, + "step": 6730 + }, + { + "epoch": 1.7759541229978248, + "grad_norm": 0.021570540964603424, + "learning_rate": 2.2404011083256367e-05, + "loss": 0.0363, + "step": 6735 + }, + { + "epoch": 1.7772724276580316, + "grad_norm": 0.005778402555733919, + "learning_rate": 2.2272067555086423e-05, + "loss": 0.002, + "step": 6740 + }, + { + "epoch": 1.7785907323182388, + "grad_norm": 0.0, + "learning_rate": 2.2140124026916482e-05, + "loss": 0.0058, + "step": 6745 + }, + { + "epoch": 1.7799090369784456, + "grad_norm": 0.010869967751204967, + "learning_rate": 2.2008180498746537e-05, + "loss": 0.0036, + "step": 6750 + }, + { + "epoch": 1.7812273416386528, + "grad_norm": 0.04336518794298172, + "learning_rate": 2.1876236970576593e-05, + "loss": 0.0074, + "step": 6755 + }, + { + "epoch": 1.7825456462988596, + "grad_norm": 0.008664094842970371, + "learning_rate": 2.1744293442406652e-05, + "loss": 0.0027, + "step": 6760 + }, + { + "epoch": 1.7838639509590668, + "grad_norm": 0.9408183097839355, + "learning_rate": 2.1612349914236708e-05, + "loss": 0.0371, + "step": 6765 + }, + { + "epoch": 1.7851822556192736, + "grad_norm": 0.016822539269924164, + "learning_rate": 2.1480406386066763e-05, + "loss": 0.0137, + "step": 6770 + }, + { + "epoch": 1.7865005602794806, + "grad_norm": 0.00829544197767973, + "learning_rate": 2.134846285789682e-05, + "loss": 0.0134, + "step": 6775 + }, + { + "epoch": 1.7878188649396876, + "grad_norm": 0.0035508016590029, + "learning_rate": 2.1216519329726878e-05, + "loss": 0.0231, + "step": 6780 + }, + { + "epoch": 1.7891371695998946, + "grad_norm": 0.13871321082115173, + "learning_rate": 2.1084575801556937e-05, + "loss": 0.0296, + "step": 6785 + }, + { + "epoch": 1.7904554742601015, + "grad_norm": 0.002578354673460126, + 
"learning_rate": 2.095263227338699e-05, + "loss": 0.0178, + "step": 6790 + }, + { + "epoch": 1.7917737789203085, + "grad_norm": 0.5279458165168762, + "learning_rate": 2.082068874521705e-05, + "loss": 0.0336, + "step": 6795 + }, + { + "epoch": 1.7930920835805155, + "grad_norm": 0.0017439400544390082, + "learning_rate": 2.0688745217047104e-05, + "loss": 0.0031, + "step": 6800 + }, + { + "epoch": 1.7944103882407223, + "grad_norm": 0.007989778183400631, + "learning_rate": 2.055680168887716e-05, + "loss": 0.0081, + "step": 6805 + }, + { + "epoch": 1.7957286929009295, + "grad_norm": 0.015163813717663288, + "learning_rate": 2.042485816070722e-05, + "loss": 0.0234, + "step": 6810 + }, + { + "epoch": 1.7970469975611363, + "grad_norm": 0.10615389794111252, + "learning_rate": 2.0292914632537275e-05, + "loss": 0.0144, + "step": 6815 + }, + { + "epoch": 1.7983653022213435, + "grad_norm": 0.03466172143816948, + "learning_rate": 2.0160971104367334e-05, + "loss": 0.0036, + "step": 6820 + }, + { + "epoch": 1.7996836068815503, + "grad_norm": 0.047511328011751175, + "learning_rate": 2.0029027576197386e-05, + "loss": 0.002, + "step": 6825 + }, + { + "epoch": 1.8010019115417573, + "grad_norm": 0.019772246479988098, + "learning_rate": 1.9897084048027445e-05, + "loss": 0.0049, + "step": 6830 + }, + { + "epoch": 1.8023202162019643, + "grad_norm": 0.1156701073050499, + "learning_rate": 1.9765140519857504e-05, + "loss": 0.0033, + "step": 6835 + }, + { + "epoch": 1.8036385208621712, + "grad_norm": 0.010991690680384636, + "learning_rate": 1.963319699168756e-05, + "loss": 0.0036, + "step": 6840 + }, + { + "epoch": 1.8049568255223782, + "grad_norm": 0.29658815264701843, + "learning_rate": 1.9501253463517615e-05, + "loss": 0.0042, + "step": 6845 + }, + { + "epoch": 1.8062751301825852, + "grad_norm": 0.056147243827581406, + "learning_rate": 1.936930993534767e-05, + "loss": 0.0052, + "step": 6850 + }, + { + "epoch": 1.8075934348427922, + "grad_norm": 0.010382590815424919, + "learning_rate": 1.923736640717773e-05, + "loss": 0.0033, + "step": 6855 + }, + { + "epoch": 1.808911739502999, + "grad_norm": 1.1247020959854126, + "learning_rate": 1.9105422879007786e-05, + "loss": 0.0112, + "step": 6860 + }, + { + "epoch": 1.8102300441632062, + "grad_norm": 1.4515737295150757, + "learning_rate": 1.897347935083784e-05, + "loss": 0.0202, + "step": 6865 + }, + { + "epoch": 1.811548348823413, + "grad_norm": 0.016307830810546875, + "learning_rate": 1.88415358226679e-05, + "loss": 0.0148, + "step": 6870 + }, + { + "epoch": 1.8128666534836202, + "grad_norm": 0.0745878592133522, + "learning_rate": 1.8709592294497956e-05, + "loss": 0.0062, + "step": 6875 + }, + { + "epoch": 1.814184958143827, + "grad_norm": 0.02554013952612877, + "learning_rate": 1.8577648766328012e-05, + "loss": 0.003, + "step": 6880 + }, + { + "epoch": 1.815503262804034, + "grad_norm": 0.45748665928840637, + "learning_rate": 1.844570523815807e-05, + "loss": 0.0386, + "step": 6885 + }, + { + "epoch": 1.816821567464241, + "grad_norm": 0.013801589608192444, + "learning_rate": 1.8313761709988126e-05, + "loss": 0.0342, + "step": 6890 + }, + { + "epoch": 1.818139872124448, + "grad_norm": 0.6251696944236755, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.0101, + "step": 6895 + }, + { + "epoch": 1.819458176784655, + "grad_norm": 0.28203102946281433, + "learning_rate": 1.8049874653648238e-05, + "loss": 0.0032, + "step": 6900 + }, + { + "epoch": 1.820776481444862, + "grad_norm": 0.28511062264442444, + "learning_rate": 1.7917931125478297e-05, + "loss": 0.0343, + "step": 6905 
+ }, + { + "epoch": 1.822094786105069, + "grad_norm": 0.004940215498209, + "learning_rate": 1.7785987597308352e-05, + "loss": 0.0265, + "step": 6910 + }, + { + "epoch": 1.8234130907652757, + "grad_norm": 0.002903093583881855, + "learning_rate": 1.7654044069138408e-05, + "loss": 0.0025, + "step": 6915 + }, + { + "epoch": 1.824731395425483, + "grad_norm": 0.008801674470305443, + "learning_rate": 1.7522100540968467e-05, + "loss": 0.0246, + "step": 6920 + }, + { + "epoch": 1.8260497000856897, + "grad_norm": 0.13823826611042023, + "learning_rate": 1.7390157012798523e-05, + "loss": 0.0058, + "step": 6925 + }, + { + "epoch": 1.827368004745897, + "grad_norm": 0.020868878811597824, + "learning_rate": 1.725821348462858e-05, + "loss": 0.0014, + "step": 6930 + }, + { + "epoch": 1.8286863094061037, + "grad_norm": 0.0027356524951756, + "learning_rate": 1.7126269956458638e-05, + "loss": 0.0035, + "step": 6935 + }, + { + "epoch": 1.8300046140663109, + "grad_norm": 0.06023023650050163, + "learning_rate": 1.6994326428288693e-05, + "loss": 0.0212, + "step": 6940 + }, + { + "epoch": 1.8313229187265176, + "grad_norm": 0.0009826788445934653, + "learning_rate": 1.686238290011875e-05, + "loss": 0.0034, + "step": 6945 + }, + { + "epoch": 1.8326412233867246, + "grad_norm": 0.2867647707462311, + "learning_rate": 1.6730439371948805e-05, + "loss": 0.0146, + "step": 6950 + }, + { + "epoch": 1.8339595280469316, + "grad_norm": 0.004501632414758205, + "learning_rate": 1.6598495843778864e-05, + "loss": 0.0026, + "step": 6955 + }, + { + "epoch": 1.8352778327071386, + "grad_norm": 0.01251616608351469, + "learning_rate": 1.6466552315608923e-05, + "loss": 0.0107, + "step": 6960 + }, + { + "epoch": 1.8365961373673456, + "grad_norm": 0.054781850427389145, + "learning_rate": 1.6334608787438975e-05, + "loss": 0.0044, + "step": 6965 + }, + { + "epoch": 1.8379144420275526, + "grad_norm": 0.1120501235127449, + "learning_rate": 1.6202665259269034e-05, + "loss": 0.0284, + "step": 6970 + }, + { + "epoch": 1.8392327466877596, + "grad_norm": 0.001668553682975471, + "learning_rate": 1.607072173109909e-05, + "loss": 0.0169, + "step": 6975 + }, + { + "epoch": 1.8405510513479664, + "grad_norm": 1.6374458074569702, + "learning_rate": 1.593877820292915e-05, + "loss": 0.031, + "step": 6980 + }, + { + "epoch": 1.8418693560081736, + "grad_norm": 0.012474550865590572, + "learning_rate": 1.5806834674759204e-05, + "loss": 0.0037, + "step": 6985 + }, + { + "epoch": 1.8431876606683804, + "grad_norm": 0.014898869208991528, + "learning_rate": 1.567489114658926e-05, + "loss": 0.003, + "step": 6990 + }, + { + "epoch": 1.8445059653285876, + "grad_norm": 0.035570453852415085, + "learning_rate": 1.554294761841932e-05, + "loss": 0.0038, + "step": 6995 + }, + { + "epoch": 1.8458242699887943, + "grad_norm": 0.9279152750968933, + "learning_rate": 1.541100409024937e-05, + "loss": 0.0235, + "step": 7000 + }, + { + "epoch": 1.8458242699887943, + "eval_loss": 0.022339830175042152, + "eval_runtime": 451.9068, + "eval_samples_per_second": 7.462, + "eval_steps_per_second": 3.731, + "step": 7000 + }, + { + "epoch": 1.8471425746490013, + "grad_norm": 0.0551234595477581, + "learning_rate": 1.527906056207943e-05, + "loss": 0.0111, + "step": 7005 + }, + { + "epoch": 1.8484608793092083, + "grad_norm": 0.011982420459389687, + "learning_rate": 1.514711703390949e-05, + "loss": 0.0279, + "step": 7010 + }, + { + "epoch": 1.8497791839694153, + "grad_norm": 0.0005129808560013771, + "learning_rate": 1.5015173505739543e-05, + "loss": 0.006, + "step": 7015 + }, + { + "epoch": 
1.8510974886296223, + "grad_norm": 0.00803748145699501, + "learning_rate": 1.4883229977569602e-05, + "loss": 0.0039, + "step": 7020 + }, + { + "epoch": 1.8524157932898293, + "grad_norm": 0.012161086313426495, + "learning_rate": 1.4751286449399656e-05, + "loss": 0.002, + "step": 7025 + }, + { + "epoch": 1.8537340979500363, + "grad_norm": 0.09517266601324081, + "learning_rate": 1.4619342921229714e-05, + "loss": 0.0021, + "step": 7030 + }, + { + "epoch": 1.855052402610243, + "grad_norm": 0.024397969245910645, + "learning_rate": 1.4487399393059773e-05, + "loss": 0.0031, + "step": 7035 + }, + { + "epoch": 1.8563707072704503, + "grad_norm": 0.010253255255520344, + "learning_rate": 1.4355455864889827e-05, + "loss": 0.005, + "step": 7040 + }, + { + "epoch": 1.857689011930657, + "grad_norm": 0.30331942439079285, + "learning_rate": 1.4223512336719886e-05, + "loss": 0.0048, + "step": 7045 + }, + { + "epoch": 1.8590073165908643, + "grad_norm": 0.4087940454483032, + "learning_rate": 1.409156880854994e-05, + "loss": 0.0375, + "step": 7050 + }, + { + "epoch": 1.860325621251071, + "grad_norm": 0.01011588517576456, + "learning_rate": 1.3959625280379999e-05, + "loss": 0.0218, + "step": 7055 + }, + { + "epoch": 1.8616439259112783, + "grad_norm": 0.004677001852542162, + "learning_rate": 1.3827681752210056e-05, + "loss": 0.0183, + "step": 7060 + }, + { + "epoch": 1.862962230571485, + "grad_norm": 0.8648074269294739, + "learning_rate": 1.3695738224040112e-05, + "loss": 0.0243, + "step": 7065 + }, + { + "epoch": 1.864280535231692, + "grad_norm": 0.00018874031957238913, + "learning_rate": 1.356379469587017e-05, + "loss": 0.0204, + "step": 7070 + }, + { + "epoch": 1.865598839891899, + "grad_norm": 0.010363743640482426, + "learning_rate": 1.3431851167700223e-05, + "loss": 0.0151, + "step": 7075 + }, + { + "epoch": 1.866917144552106, + "grad_norm": 0.015046795830130577, + "learning_rate": 1.3299907639530282e-05, + "loss": 0.0028, + "step": 7080 + }, + { + "epoch": 1.868235449212313, + "grad_norm": 0.00892347190529108, + "learning_rate": 1.316796411136034e-05, + "loss": 0.0076, + "step": 7085 + }, + { + "epoch": 1.86955375387252, + "grad_norm": 0.011039508506655693, + "learning_rate": 1.3036020583190395e-05, + "loss": 0.0041, + "step": 7090 + }, + { + "epoch": 1.870872058532727, + "grad_norm": 0.612829864025116, + "learning_rate": 1.2904077055020453e-05, + "loss": 0.0406, + "step": 7095 + }, + { + "epoch": 1.8721903631929337, + "grad_norm": 0.02630307897925377, + "learning_rate": 1.2772133526850508e-05, + "loss": 0.005, + "step": 7100 + }, + { + "epoch": 1.873508667853141, + "grad_norm": 0.05626239255070686, + "learning_rate": 1.2640189998680566e-05, + "loss": 0.0494, + "step": 7105 + }, + { + "epoch": 1.8748269725133477, + "grad_norm": 0.009870803914964199, + "learning_rate": 1.2508246470510623e-05, + "loss": 0.0036, + "step": 7110 + }, + { + "epoch": 1.876145277173555, + "grad_norm": 0.0034679556265473366, + "learning_rate": 1.2376302942340679e-05, + "loss": 0.0026, + "step": 7115 + }, + { + "epoch": 1.8774635818337617, + "grad_norm": 0.0021383303683251143, + "learning_rate": 1.2244359414170734e-05, + "loss": 0.0028, + "step": 7120 + }, + { + "epoch": 1.8787818864939687, + "grad_norm": 0.016683265566825867, + "learning_rate": 1.2112415886000793e-05, + "loss": 0.0027, + "step": 7125 + }, + { + "epoch": 1.8801001911541757, + "grad_norm": 0.30483224987983704, + "learning_rate": 1.1980472357830849e-05, + "loss": 0.0081, + "step": 7130 + }, + { + "epoch": 1.8814184958143827, + "grad_norm": 0.055007629096508026, + 
"learning_rate": 1.1848528829660906e-05, + "loss": 0.0058, + "step": 7135 + }, + { + "epoch": 1.8827368004745897, + "grad_norm": 0.013665193691849709, + "learning_rate": 1.1716585301490962e-05, + "loss": 0.0289, + "step": 7140 + }, + { + "epoch": 1.8840551051347967, + "grad_norm": 0.004984239581972361, + "learning_rate": 1.158464177332102e-05, + "loss": 0.0049, + "step": 7145 + }, + { + "epoch": 1.8853734097950037, + "grad_norm": 0.4533900320529938, + "learning_rate": 1.1452698245151077e-05, + "loss": 0.0127, + "step": 7150 + }, + { + "epoch": 1.8866917144552104, + "grad_norm": 0.8876304030418396, + "learning_rate": 1.1320754716981132e-05, + "loss": 0.0124, + "step": 7155 + }, + { + "epoch": 1.8880100191154177, + "grad_norm": 0.004243049304932356, + "learning_rate": 1.118881118881119e-05, + "loss": 0.0034, + "step": 7160 + }, + { + "epoch": 1.8893283237756244, + "grad_norm": 0.0425611287355423, + "learning_rate": 1.1056867660641245e-05, + "loss": 0.0108, + "step": 7165 + }, + { + "epoch": 1.8906466284358316, + "grad_norm": 0.005729519762098789, + "learning_rate": 1.0924924132471303e-05, + "loss": 0.0127, + "step": 7170 + }, + { + "epoch": 1.8919649330960384, + "grad_norm": 0.02838645875453949, + "learning_rate": 1.079298060430136e-05, + "loss": 0.0055, + "step": 7175 + }, + { + "epoch": 1.8932832377562456, + "grad_norm": 0.007703767623752356, + "learning_rate": 1.0661037076131416e-05, + "loss": 0.0029, + "step": 7180 + }, + { + "epoch": 1.8946015424164524, + "grad_norm": 0.00507679907605052, + "learning_rate": 1.0529093547961473e-05, + "loss": 0.0019, + "step": 7185 + }, + { + "epoch": 1.8959198470766594, + "grad_norm": 0.3672322630882263, + "learning_rate": 1.0397150019791529e-05, + "loss": 0.0502, + "step": 7190 + }, + { + "epoch": 1.8972381517368664, + "grad_norm": 0.002695564879104495, + "learning_rate": 1.0265206491621586e-05, + "loss": 0.0022, + "step": 7195 + }, + { + "epoch": 1.8985564563970734, + "grad_norm": 0.6876901984214783, + "learning_rate": 1.0133262963451644e-05, + "loss": 0.0384, + "step": 7200 + }, + { + "epoch": 1.8998747610572804, + "grad_norm": 0.0224466510117054, + "learning_rate": 1.0001319435281701e-05, + "loss": 0.0019, + "step": 7205 + }, + { + "epoch": 1.9011930657174874, + "grad_norm": 0.21176137030124664, + "learning_rate": 9.869375907111757e-06, + "loss": 0.0214, + "step": 7210 + }, + { + "epoch": 1.9025113703776944, + "grad_norm": 0.01078563928604126, + "learning_rate": 9.737432378941814e-06, + "loss": 0.0045, + "step": 7215 + }, + { + "epoch": 1.9038296750379011, + "grad_norm": 0.0053437380120158195, + "learning_rate": 9.60548885077187e-06, + "loss": 0.0038, + "step": 7220 + }, + { + "epoch": 1.9051479796981083, + "grad_norm": 0.019259070977568626, + "learning_rate": 9.473545322601927e-06, + "loss": 0.0025, + "step": 7225 + }, + { + "epoch": 1.906466284358315, + "grad_norm": 0.01319583784788847, + "learning_rate": 9.341601794431984e-06, + "loss": 0.0036, + "step": 7230 + }, + { + "epoch": 1.9077845890185223, + "grad_norm": 0.012393418699502945, + "learning_rate": 9.20965826626204e-06, + "loss": 0.0033, + "step": 7235 + }, + { + "epoch": 1.909102893678729, + "grad_norm": 0.37064847350120544, + "learning_rate": 9.077714738092097e-06, + "loss": 0.039, + "step": 7240 + }, + { + "epoch": 1.910421198338936, + "grad_norm": 0.012969265691936016, + "learning_rate": 8.945771209922153e-06, + "loss": 0.0023, + "step": 7245 + }, + { + "epoch": 1.911739502999143, + "grad_norm": 0.187465637922287, + "learning_rate": 8.81382768175221e-06, + "loss": 0.0176, + "step": 7250 + 
}, + { + "epoch": 1.91305780765935, + "grad_norm": 0.22874793410301208, + "learning_rate": 8.681884153582268e-06, + "loss": 0.0028, + "step": 7255 + }, + { + "epoch": 1.914376112319557, + "grad_norm": 0.015071459114551544, + "learning_rate": 8.549940625412323e-06, + "loss": 0.0197, + "step": 7260 + }, + { + "epoch": 1.915694416979764, + "grad_norm": 0.0037113677244633436, + "learning_rate": 8.41799709724238e-06, + "loss": 0.0026, + "step": 7265 + }, + { + "epoch": 1.917012721639971, + "grad_norm": 0.1318834125995636, + "learning_rate": 8.286053569072436e-06, + "loss": 0.0025, + "step": 7270 + }, + { + "epoch": 1.9183310263001778, + "grad_norm": 0.025826094672083855, + "learning_rate": 8.154110040902495e-06, + "loss": 0.0031, + "step": 7275 + }, + { + "epoch": 1.919649330960385, + "grad_norm": 0.02691330574452877, + "learning_rate": 8.022166512732551e-06, + "loss": 0.0038, + "step": 7280 + }, + { + "epoch": 1.9209676356205918, + "grad_norm": 0.026079120114445686, + "learning_rate": 7.890222984562608e-06, + "loss": 0.0035, + "step": 7285 + }, + { + "epoch": 1.922285940280799, + "grad_norm": 0.20154571533203125, + "learning_rate": 7.758279456392664e-06, + "loss": 0.0031, + "step": 7290 + }, + { + "epoch": 1.9236042449410058, + "grad_norm": 0.000983367906883359, + "learning_rate": 7.6263359282227206e-06, + "loss": 0.0048, + "step": 7295 + }, + { + "epoch": 1.924922549601213, + "grad_norm": 0.018915656954050064, + "learning_rate": 7.494392400052777e-06, + "loss": 0.0028, + "step": 7300 + }, + { + "epoch": 1.9262408542614198, + "grad_norm": 0.028158968314528465, + "learning_rate": 7.362448871882835e-06, + "loss": 0.0055, + "step": 7305 + }, + { + "epoch": 1.9275591589216268, + "grad_norm": 0.00263324286788702, + "learning_rate": 7.230505343712892e-06, + "loss": 0.0052, + "step": 7310 + }, + { + "epoch": 1.9288774635818338, + "grad_norm": 0.009877101518213749, + "learning_rate": 7.0985618155429475e-06, + "loss": 0.0023, + "step": 7315 + }, + { + "epoch": 1.9301957682420408, + "grad_norm": 0.0, + "learning_rate": 6.966618287373004e-06, + "loss": 0.0359, + "step": 7320 + }, + { + "epoch": 1.9315140729022477, + "grad_norm": 0.458391398191452, + "learning_rate": 6.8346747592030605e-06, + "loss": 0.0074, + "step": 7325 + }, + { + "epoch": 1.9328323775624545, + "grad_norm": 0.01231459341943264, + "learning_rate": 6.702731231033119e-06, + "loss": 0.0037, + "step": 7330 + }, + { + "epoch": 1.9341506822226617, + "grad_norm": 0.020665613934397697, + "learning_rate": 6.570787702863175e-06, + "loss": 0.0042, + "step": 7335 + }, + { + "epoch": 1.9354689868828685, + "grad_norm": 0.10898768156766891, + "learning_rate": 6.438844174693232e-06, + "loss": 0.0231, + "step": 7340 + }, + { + "epoch": 1.9367872915430757, + "grad_norm": 0.007229386828839779, + "learning_rate": 6.306900646523288e-06, + "loss": 0.0046, + "step": 7345 + }, + { + "epoch": 1.9381055962032825, + "grad_norm": 0.06096978858113289, + "learning_rate": 6.1749571183533456e-06, + "loss": 0.0032, + "step": 7350 + }, + { + "epoch": 1.9394239008634897, + "grad_norm": 0.0034814421087503433, + "learning_rate": 6.043013590183401e-06, + "loss": 0.0131, + "step": 7355 + }, + { + "epoch": 1.9407422055236965, + "grad_norm": 0.04464314505457878, + "learning_rate": 5.9110700620134586e-06, + "loss": 0.0075, + "step": 7360 + }, + { + "epoch": 1.9420605101839035, + "grad_norm": 0.024586567655205727, + "learning_rate": 5.779126533843515e-06, + "loss": 0.0051, + "step": 7365 + }, + { + "epoch": 1.9433788148441105, + "grad_norm": 0.1543113738298416, + 
"learning_rate": 5.647183005673572e-06, + "loss": 0.0037, + "step": 7370 + }, + { + "epoch": 1.9446971195043175, + "grad_norm": 0.2567637860774994, + "learning_rate": 5.515239477503629e-06, + "loss": 0.0407, + "step": 7375 + }, + { + "epoch": 1.9460154241645244, + "grad_norm": 0.00815210398286581, + "learning_rate": 5.3832959493336855e-06, + "loss": 0.0048, + "step": 7380 + }, + { + "epoch": 1.9473337288247314, + "grad_norm": 0.010180729441344738, + "learning_rate": 5.251352421163743e-06, + "loss": 0.002, + "step": 7385 + }, + { + "epoch": 1.9486520334849384, + "grad_norm": 0.01142155285924673, + "learning_rate": 5.1194088929937985e-06, + "loss": 0.0075, + "step": 7390 + }, + { + "epoch": 1.9499703381451452, + "grad_norm": 0.005243134684860706, + "learning_rate": 4.987465364823855e-06, + "loss": 0.002, + "step": 7395 + }, + { + "epoch": 1.9512886428053524, + "grad_norm": 0.289771169424057, + "learning_rate": 4.855521836653912e-06, + "loss": 0.016, + "step": 7400 + }, + { + "epoch": 1.9526069474655592, + "grad_norm": 0.0042951651848852634, + "learning_rate": 4.723578308483969e-06, + "loss": 0.0301, + "step": 7405 + }, + { + "epoch": 1.9539252521257664, + "grad_norm": 0.27518171072006226, + "learning_rate": 4.591634780314026e-06, + "loss": 0.0059, + "step": 7410 + }, + { + "epoch": 1.9552435567859732, + "grad_norm": 0.13695034384727478, + "learning_rate": 4.459691252144083e-06, + "loss": 0.0039, + "step": 7415 + }, + { + "epoch": 1.9565618614461802, + "grad_norm": 0.32960009574890137, + "learning_rate": 4.327747723974139e-06, + "loss": 0.0159, + "step": 7420 + }, + { + "epoch": 1.9578801661063872, + "grad_norm": 0.02581116557121277, + "learning_rate": 4.195804195804197e-06, + "loss": 0.003, + "step": 7425 + }, + { + "epoch": 1.9591984707665941, + "grad_norm": 0.01672324910759926, + "learning_rate": 4.063860667634252e-06, + "loss": 0.0233, + "step": 7430 + }, + { + "epoch": 1.9605167754268011, + "grad_norm": 0.021988827735185623, + "learning_rate": 3.93191713946431e-06, + "loss": 0.0029, + "step": 7435 + }, + { + "epoch": 1.9618350800870081, + "grad_norm": 0.27279871702194214, + "learning_rate": 3.799973611294366e-06, + "loss": 0.0023, + "step": 7440 + }, + { + "epoch": 1.9631533847472151, + "grad_norm": 0.002893030410632491, + "learning_rate": 3.6680300831244226e-06, + "loss": 0.0031, + "step": 7445 + }, + { + "epoch": 1.964471689407422, + "grad_norm": 0.024236679077148438, + "learning_rate": 3.53608655495448e-06, + "loss": 0.0519, + "step": 7450 + }, + { + "epoch": 1.965789994067629, + "grad_norm": 0.011010506190359592, + "learning_rate": 3.4041430267845365e-06, + "loss": 0.009, + "step": 7455 + }, + { + "epoch": 1.9671082987278359, + "grad_norm": 0.11187774688005447, + "learning_rate": 3.2721994986145926e-06, + "loss": 0.0197, + "step": 7460 + }, + { + "epoch": 1.968426603388043, + "grad_norm": 0.09504564106464386, + "learning_rate": 3.14025597044465e-06, + "loss": 0.0261, + "step": 7465 + }, + { + "epoch": 1.9697449080482499, + "grad_norm": 0.020677559077739716, + "learning_rate": 3.008312442274707e-06, + "loss": 0.0232, + "step": 7470 + }, + { + "epoch": 1.971063212708457, + "grad_norm": 0.020134272053837776, + "learning_rate": 2.8763689141047634e-06, + "loss": 0.0208, + "step": 7475 + }, + { + "epoch": 1.9723815173686639, + "grad_norm": 0.07275384664535522, + "learning_rate": 2.74442538593482e-06, + "loss": 0.0223, + "step": 7480 + }, + { + "epoch": 1.9736998220288708, + "grad_norm": 0.001021620468236506, + "learning_rate": 2.612481857764877e-06, + "loss": 0.003, + "step": 7485 + }, + { 
+ "epoch": 1.9750181266890778, + "grad_norm": 0.011956339702010155, + "learning_rate": 2.4805383295949337e-06, + "loss": 0.0083, + "step": 7490 + }, + { + "epoch": 1.9763364313492848, + "grad_norm": 0.017296286299824715, + "learning_rate": 2.3485948014249902e-06, + "loss": 0.0431, + "step": 7495 + }, + { + "epoch": 1.9776547360094918, + "grad_norm": 0.0122813880443573, + "learning_rate": 2.2166512732550468e-06, + "loss": 0.0146, + "step": 7500 + }, + { + "epoch": 1.9776547360094918, + "eval_loss": 0.02133146859705448, + "eval_runtime": 451.6814, + "eval_samples_per_second": 7.465, + "eval_steps_per_second": 3.733, + "step": 7500 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.053124004124652e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7500/training_args.bin b/checkpoint-7500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-7500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688 diff --git a/checkpoint-7584/README.md b/checkpoint-7584/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e738b5eda9883f4a45efd9d37b8b8357ba758456 --- /dev/null +++ b/checkpoint-7584/README.md @@ -0,0 +1,202 @@ +--- +base_model: unsloth/Llama-3.2-3B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-7584/adapter_config.json b/checkpoint-7584/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..852d02d4dc1204f4332be8b3363041a3d3e3feb3 --- /dev/null +++ b/checkpoint-7584/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "unsloth/Llama-3.2-3B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-7584/adapter_model.safetensors b/checkpoint-7584/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c9461c44501cae4d32e4472f00f8746d7047992f --- /dev/null +++ b/checkpoint-7584/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6a808088f5ab2cfb7a66f7e0381dbfde54a4ff294c1d1a42dccf243788915c7 +size 97307544 diff --git a/checkpoint-7584/optimizer.pt b/checkpoint-7584/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e13bba8c539c89336a0358bd3927f9c4dfa7cfb --- 
/dev/null +++ b/checkpoint-7584/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db1c601aab8c44acc8456fa277a096e1904a218ac81a43ffee9256e7e750e3b4 +size 50866370 diff --git a/checkpoint-7584/rng_state.pth b/checkpoint-7584/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6184fd4d1318a0aa9f43ef428e9eac008b905459 --- /dev/null +++ b/checkpoint-7584/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a70fcdbdddd24f5a3236fc25540034ecb89a2d4e934b73da08b17b4d5ac00b3 +size 14244 diff --git a/checkpoint-7584/scheduler.pt b/checkpoint-7584/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f45579553dbe8473d77a8d885d7b5cb3db0bb15d --- /dev/null +++ b/checkpoint-7584/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c06c902c264ffe3f79d6f7e5c817d8f39caaa113cb0f278ee08dac0410cec21 +size 1064 diff --git a/checkpoint-7584/special_tokens_map.json b/checkpoint-7584/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/checkpoint-7584/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-7584/tokenizer.json b/checkpoint-7584/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-7584/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-7584/tokenizer_config.json b/checkpoint-7584/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfd835b0ef1033e89629af6e13ae45397e9fa2f8 --- /dev/null +++ b/checkpoint-7584/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": 
"<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": 
"<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": 
"<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": 
"<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": 
"<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": 
"<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": 
"<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": 
"<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 July 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\n\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\n\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\n\" }}\n{{- \"Today Date: \" + date_string + \"\n\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content'] %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\n\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\n\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\n\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\n\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "right", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/checkpoint-7584/trainer_state.json b/checkpoint-7584/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2fa9f1e33e875b1dca928687cd5c4752038ec2ec --- /dev/null +++ b/checkpoint-7584/trainer_state.json @@ -0,0 +1,10768 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.999802254300969, + "eval_steps": 500, + "global_step": 7584, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001318304660206974, + "grad_norm": 4.59375, + "learning_rate": 0.0002, + "loss": 1.9624, + "step": 5 + }, + { + "epoch": 0.002636609320413948, + "grad_norm": 1.7421875, + "learning_rate": 0.00019986805647183008, + "loss": 0.6513, + "step": 10 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.84375, + "learning_rate": 0.00019973611294366012, + "loss": 0.1146, + "step": 15 + }, + { + "epoch": 0.005273218640827896, + "grad_norm": 1.3203125, + "learning_rate": 0.0001996041694154902, + "loss": 0.0529, + "step": 20 + }, + { + "epoch": 0.006591523301034869, + "grad_norm": 0.40234375, + "learning_rate": 0.00019947222588732023, + "loss": 0.1214, + "step": 25 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 1.5390625, + "learning_rate": 0.0001993402823591503, + "loss": 0.0919, + 
"step": 30 + }, + { + "epoch": 0.009228132621448816, + "grad_norm": 0.06201171875, + "learning_rate": 0.00019920833883098034, + "loss": 0.09, + "step": 35 + }, + { + "epoch": 0.010546437281655791, + "grad_norm": 1.53125, + "learning_rate": 0.0001990763953028104, + "loss": 0.1945, + "step": 40 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.2890625, + "learning_rate": 0.00019894445177464048, + "loss": 0.1259, + "step": 45 + }, + { + "epoch": 0.013183046602069738, + "grad_norm": 0.609375, + "learning_rate": 0.00019881250824647052, + "loss": 0.027, + "step": 50 + }, + { + "epoch": 0.014501351262276712, + "grad_norm": 0.369140625, + "learning_rate": 0.00019868056471830057, + "loss": 0.1068, + "step": 55 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 0.34765625, + "learning_rate": 0.00019854862119013064, + "loss": 0.0542, + "step": 60 + }, + { + "epoch": 0.01713796058269066, + "grad_norm": 0.055419921875, + "learning_rate": 0.00019841667766196068, + "loss": 0.0901, + "step": 65 + }, + { + "epoch": 0.018456265242897632, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00019828473413379075, + "loss": 0.0091, + "step": 70 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 0.0079345703125, + "learning_rate": 0.0001981527906056208, + "loss": 0.0744, + "step": 75 + }, + { + "epoch": 0.021092874563311582, + "grad_norm": 0.65234375, + "learning_rate": 0.00019802084707745086, + "loss": 0.1108, + "step": 80 + }, + { + "epoch": 0.022411179223518554, + "grad_norm": 0.50390625, + "learning_rate": 0.0001978889035492809, + "loss": 0.0446, + "step": 85 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019775696002111097, + "loss": 0.0982, + "step": 90 + }, + { + "epoch": 0.0250477885439325, + "grad_norm": 0.490234375, + "learning_rate": 0.00019762501649294104, + "loss": 0.1035, + "step": 95 + }, + { + "epoch": 0.026366093204139476, + "grad_norm": 0.12158203125, + "learning_rate": 0.00019749307296477108, + "loss": 0.0401, + "step": 100 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 0.16015625, + "learning_rate": 0.00019736112943660115, + "loss": 0.0309, + "step": 105 + }, + { + "epoch": 0.029002702524553423, + "grad_norm": 1.359375, + "learning_rate": 0.0001972291859084312, + "loss": 0.1032, + "step": 110 + }, + { + "epoch": 0.0303210071847604, + "grad_norm": 0.52734375, + "learning_rate": 0.00019709724238026126, + "loss": 0.0811, + "step": 115 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 0.177734375, + "learning_rate": 0.00019696529885209133, + "loss": 0.0258, + "step": 120 + }, + { + "epoch": 0.03295761650517435, + "grad_norm": 0.234375, + "learning_rate": 0.00019683335532392137, + "loss": 0.0437, + "step": 125 + }, + { + "epoch": 0.03427592116538132, + "grad_norm": 1.3046875, + "learning_rate": 0.00019670141179575144, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 0.2734375, + "learning_rate": 0.00019656946826758148, + "loss": 0.0132, + "step": 135 + }, + { + "epoch": 0.036912530485795264, + "grad_norm": 0.66015625, + "learning_rate": 0.00019643752473941155, + "loss": 0.0396, + "step": 140 + }, + { + "epoch": 0.03823083514600224, + "grad_norm": 1.0546875, + "learning_rate": 0.0001963055812112416, + "loss": 0.0449, + "step": 145 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019617363768307166, + "loss": 0.1196, + "step": 150 + }, + { + "epoch": 0.040867444466416186, + "grad_norm": 0.5859375, + "learning_rate": 
0.0001960416941549017, + "loss": 0.0588, + "step": 155 + }, + { + "epoch": 0.042185749126623165, + "grad_norm": 0.06005859375, + "learning_rate": 0.00019590975062673175, + "loss": 0.0234, + "step": 160 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 0.4921875, + "learning_rate": 0.00019577780709856182, + "loss": 0.0916, + "step": 165 + }, + { + "epoch": 0.04482235844703711, + "grad_norm": 0.84375, + "learning_rate": 0.0001956458635703919, + "loss": 0.0271, + "step": 170 + }, + { + "epoch": 0.04614066310724409, + "grad_norm": 0.8828125, + "learning_rate": 0.00019551392004222193, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 0.0152587890625, + "learning_rate": 0.000195381976514052, + "loss": 0.0356, + "step": 180 + }, + { + "epoch": 0.04877727242765803, + "grad_norm": 0.09326171875, + "learning_rate": 0.00019525003298588204, + "loss": 0.0057, + "step": 185 + }, + { + "epoch": 0.050095577087865, + "grad_norm": 0.24609375, + "learning_rate": 0.0001951180894577121, + "loss": 0.0082, + "step": 190 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.05029296875, + "learning_rate": 0.00019498614592954215, + "loss": 0.0178, + "step": 195 + }, + { + "epoch": 0.05273218640827895, + "grad_norm": 0.0390625, + "learning_rate": 0.00019485420240137222, + "loss": 0.0789, + "step": 200 + }, + { + "epoch": 0.054050491068485924, + "grad_norm": 0.5625, + "learning_rate": 0.0001947222588732023, + "loss": 0.0645, + "step": 205 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 0.53515625, + "learning_rate": 0.00019459031534503233, + "loss": 0.116, + "step": 210 + }, + { + "epoch": 0.056687100388899875, + "grad_norm": 0.55078125, + "learning_rate": 0.0001944583718168624, + "loss": 0.0516, + "step": 215 + }, + { + "epoch": 0.058005405049106847, + "grad_norm": 0.314453125, + "learning_rate": 0.00019432642828869244, + "loss": 0.1019, + "step": 220 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 0.1123046875, + "learning_rate": 0.0001941944847605225, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.0606420143695208, + "grad_norm": 0.4921875, + "learning_rate": 0.00019406254123235256, + "loss": 0.0368, + "step": 230 + }, + { + "epoch": 0.06196031902972777, + "grad_norm": 0.054443359375, + "learning_rate": 0.00019393059770418262, + "loss": 0.037, + "step": 235 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 0.008544921875, + "learning_rate": 0.0001937986541760127, + "loss": 0.0324, + "step": 240 + }, + { + "epoch": 0.06459692835014172, + "grad_norm": 1.5, + "learning_rate": 0.00019366671064784274, + "loss": 0.0334, + "step": 245 + }, + { + "epoch": 0.0659152330103487, + "grad_norm": 0.2109375, + "learning_rate": 0.0001935347671196728, + "loss": 0.0671, + "step": 250 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 2.0625, + "learning_rate": 0.00019340282359150285, + "loss": 0.1559, + "step": 255 + }, + { + "epoch": 0.06855184233076264, + "grad_norm": 0.7734375, + "learning_rate": 0.0001932708800633329, + "loss": 0.0198, + "step": 260 + }, + { + "epoch": 0.06987014699096962, + "grad_norm": 0.42578125, + "learning_rate": 0.00019313893653516296, + "loss": 0.0151, + "step": 265 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 0.1884765625, + "learning_rate": 0.000193006993006993, + "loss": 0.0269, + "step": 270 + }, + { + "epoch": 0.07250675631138356, + "grad_norm": 1.546875, + "learning_rate": 0.00019287504947882307, + "loss": 0.0565, + "step": 275 + }, + { + "epoch": 0.07382506097159053, + "grad_norm": 0.5078125, + 
"learning_rate": 0.0001927431059506531, + "loss": 0.0942, + "step": 280 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 0.392578125, + "learning_rate": 0.00019261116242248318, + "loss": 0.0061, + "step": 285 + }, + { + "epoch": 0.07646167029200449, + "grad_norm": 1.9140625, + "learning_rate": 0.00019247921889431325, + "loss": 0.0497, + "step": 290 + }, + { + "epoch": 0.07777997495221145, + "grad_norm": 0.08837890625, + "learning_rate": 0.0001923472753661433, + "loss": 0.0573, + "step": 295 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.046875, + "learning_rate": 0.00019221533183797336, + "loss": 0.0528, + "step": 300 + }, + { + "epoch": 0.08041658427262541, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001920833883098034, + "loss": 0.0506, + "step": 305 + }, + { + "epoch": 0.08173488893283237, + "grad_norm": 0.08203125, + "learning_rate": 0.00019195144478163347, + "loss": 0.0307, + "step": 310 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 0.111328125, + "learning_rate": 0.00019181950125346354, + "loss": 0.0365, + "step": 315 + }, + { + "epoch": 0.08437149825324633, + "grad_norm": 1.2890625, + "learning_rate": 0.00019168755772529358, + "loss": 0.0447, + "step": 320 + }, + { + "epoch": 0.0856898029134533, + "grad_norm": 0.6015625, + "learning_rate": 0.00019155561419712365, + "loss": 0.0605, + "step": 325 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 0.71875, + "learning_rate": 0.0001914236706689537, + "loss": 0.0846, + "step": 330 + }, + { + "epoch": 0.08832641223386725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019129172714078376, + "loss": 0.0713, + "step": 335 + }, + { + "epoch": 0.08964471689407422, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001911597836126138, + "loss": 0.0826, + "step": 340 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 2.203125, + "learning_rate": 0.00019102784008444388, + "loss": 0.0441, + "step": 345 + }, + { + "epoch": 0.09228132621448817, + "grad_norm": 1.21875, + "learning_rate": 0.00019089589655627395, + "loss": 0.1378, + "step": 350 + }, + { + "epoch": 0.09359963087469514, + "grad_norm": 3.0625, + "learning_rate": 0.00019076395302810396, + "loss": 0.1552, + "step": 355 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 0.232421875, + "learning_rate": 0.00019063200949993403, + "loss": 0.0458, + "step": 360 + }, + { + "epoch": 0.0962362401951091, + "grad_norm": 0.71875, + "learning_rate": 0.0001905000659717641, + "loss": 0.0312, + "step": 365 + }, + { + "epoch": 0.09755454485531606, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00019036812244359414, + "loss": 0.0247, + "step": 370 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 0.064453125, + "learning_rate": 0.0001902361789154242, + "loss": 0.054, + "step": 375 + }, + { + "epoch": 0.10019115417573, + "grad_norm": 0.021240234375, + "learning_rate": 0.00019010423538725425, + "loss": 0.0023, + "step": 380 + }, + { + "epoch": 0.10150945883593698, + "grad_norm": 0.0361328125, + "learning_rate": 0.00018997229185908432, + "loss": 0.0884, + "step": 385 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.703125, + "learning_rate": 0.00018984034833091436, + "loss": 0.0506, + "step": 390 + }, + { + "epoch": 0.10414606815635093, + "grad_norm": 0.08837890625, + "learning_rate": 0.00018970840480274443, + "loss": 0.1123, + "step": 395 + }, + { + "epoch": 0.1054643728165579, + "grad_norm": 0.6953125, + "learning_rate": 0.0001895764612745745, + "loss": 0.0597, + "step": 400 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 
0.18359375, + "learning_rate": 0.00018944451774640454, + "loss": 0.0138, + "step": 405 + }, + { + "epoch": 0.10810098213697185, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0001893125742182346, + "loss": 0.0249, + "step": 410 + }, + { + "epoch": 0.10941928679717883, + "grad_norm": 0.00970458984375, + "learning_rate": 0.00018918063069006466, + "loss": 0.0084, + "step": 415 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 0.54296875, + "learning_rate": 0.00018904868716189472, + "loss": 0.0541, + "step": 420 + }, + { + "epoch": 0.11205589611759277, + "grad_norm": 0.74609375, + "learning_rate": 0.00018891674363372477, + "loss": 0.007, + "step": 425 + }, + { + "epoch": 0.11337420077779975, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00018878480010555484, + "loss": 0.0875, + "step": 430 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 0.9296875, + "learning_rate": 0.0001886528565773849, + "loss": 0.1207, + "step": 435 + }, + { + "epoch": 0.11601081009821369, + "grad_norm": 1.2734375, + "learning_rate": 0.00018852091304921495, + "loss": 0.1143, + "step": 440 + }, + { + "epoch": 0.11732911475842067, + "grad_norm": 0.6484375, + "learning_rate": 0.00018838896952104502, + "loss": 0.0393, + "step": 445 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018825702599287506, + "loss": 0.02, + "step": 450 + }, + { + "epoch": 0.11996572407883462, + "grad_norm": 0.486328125, + "learning_rate": 0.0001881250824647051, + "loss": 0.0891, + "step": 455 + }, + { + "epoch": 0.1212840287390416, + "grad_norm": 1.0, + "learning_rate": 0.00018799313893653517, + "loss": 0.0469, + "step": 460 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001878611954083652, + "loss": 0.019, + "step": 465 + }, + { + "epoch": 0.12392063805945554, + "grad_norm": 0.03857421875, + "learning_rate": 0.00018772925188019528, + "loss": 0.007, + "step": 470 + }, + { + "epoch": 0.12523894271966252, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00018759730835202532, + "loss": 0.0039, + "step": 475 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 0.014404296875, + "learning_rate": 0.0001874653648238554, + "loss": 0.0043, + "step": 480 + }, + { + "epoch": 0.12787555204007647, + "grad_norm": 0.51953125, + "learning_rate": 0.00018733342129568546, + "loss": 0.1326, + "step": 485 + }, + { + "epoch": 0.12919385670028344, + "grad_norm": 0.99609375, + "learning_rate": 0.0001872014777675155, + "loss": 0.0369, + "step": 490 + }, + { + "epoch": 0.1305121613604904, + "grad_norm": 0.2734375, + "learning_rate": 0.00018706953423934557, + "loss": 0.0395, + "step": 495 + }, + { + "epoch": 0.1318304660206974, + "grad_norm": 0.083984375, + "learning_rate": 0.00018693759071117561, + "loss": 0.0284, + "step": 500 + }, + { + "epoch": 0.1318304660206974, + "eval_loss": 0.04542969539761543, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.5293, + "eval_samples_per_second": 7.37, + "eval_steps_per_second": 3.685, + "step": 500 + }, + { + "epoch": 0.13314877068090436, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00018680564718300568, + "loss": 0.0533, + "step": 505 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 0.71484375, + "learning_rate": 0.00018667370365483575, + "loss": 0.0183, + "step": 510 + }, + { + "epoch": 0.13578538000131832, + "grad_norm": 0.018798828125, + "learning_rate": 0.0001865417601266658, + "loss": 0.0473, + "step": 515 + }, + { + "epoch": 0.13710368466152528, + "grad_norm": 0.388671875, + 
"learning_rate": 0.00018640981659849586, + "loss": 0.0562, + "step": 520 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.77734375, + "learning_rate": 0.0001862778730703259, + "loss": 0.0755, + "step": 525 + }, + { + "epoch": 0.13974029398193924, + "grad_norm": 2.8125, + "learning_rate": 0.00018614592954215598, + "loss": 0.0422, + "step": 530 + }, + { + "epoch": 0.1410585986421462, + "grad_norm": 0.48828125, + "learning_rate": 0.00018601398601398602, + "loss": 0.0882, + "step": 535 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.16015625, + "learning_rate": 0.0001858820424858161, + "loss": 0.0131, + "step": 540 + }, + { + "epoch": 0.14369520796256013, + "grad_norm": 0.31640625, + "learning_rate": 0.00018575009895764616, + "loss": 0.03, + "step": 545 + }, + { + "epoch": 0.14501351262276713, + "grad_norm": 0.0120849609375, + "learning_rate": 0.0001856181554294762, + "loss": 0.0425, + "step": 550 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 0.390625, + "learning_rate": 0.00018548621190130624, + "loss": 0.011, + "step": 555 + }, + { + "epoch": 0.14765012194318106, + "grad_norm": 1.9609375, + "learning_rate": 0.0001853542683731363, + "loss": 0.0807, + "step": 560 + }, + { + "epoch": 0.14896842660338805, + "grad_norm": 0.609375, + "learning_rate": 0.00018522232484496635, + "loss": 0.0278, + "step": 565 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 0.087890625, + "learning_rate": 0.00018509038131679642, + "loss": 0.0484, + "step": 570 + }, + { + "epoch": 0.15160503592380198, + "grad_norm": 0.5078125, + "learning_rate": 0.00018495843778862646, + "loss": 0.1277, + "step": 575 + }, + { + "epoch": 0.15292334058400897, + "grad_norm": 0.8125, + "learning_rate": 0.00018482649426045653, + "loss": 0.058, + "step": 580 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.22265625, + "learning_rate": 0.00018469455073228657, + "loss": 0.0259, + "step": 585 + }, + { + "epoch": 0.1555599499044229, + "grad_norm": 1.8984375, + "learning_rate": 0.00018456260720411664, + "loss": 0.113, + "step": 590 + }, + { + "epoch": 0.1568782545646299, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001844306636759467, + "loss": 0.0312, + "step": 595 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.0322265625, + "learning_rate": 0.00018429872014777676, + "loss": 0.0476, + "step": 600 + }, + { + "epoch": 0.15951486388504382, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00018416677661960682, + "loss": 0.0232, + "step": 605 + }, + { + "epoch": 0.16083316854525082, + "grad_norm": 0.57421875, + "learning_rate": 0.00018403483309143687, + "loss": 0.1287, + "step": 610 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 0.765625, + "learning_rate": 0.00018390288956326694, + "loss": 0.0991, + "step": 615 + }, + { + "epoch": 0.16346977786566474, + "grad_norm": 0.3125, + "learning_rate": 0.00018377094603509698, + "loss": 0.0247, + "step": 620 + }, + { + "epoch": 0.16478808252587174, + "grad_norm": 0.37890625, + "learning_rate": 0.00018363900250692705, + "loss": 0.0632, + "step": 625 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018350705897875712, + "loss": 0.0314, + "step": 630 + }, + { + "epoch": 0.16742469184628567, + "grad_norm": 0.0673828125, + "learning_rate": 0.00018337511545058716, + "loss": 0.0425, + "step": 635 + }, + { + "epoch": 0.16874299650649266, + "grad_norm": 0.396484375, + "learning_rate": 0.00018324317192241723, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 
0.057373046875, + "learning_rate": 0.00018311122839424727, + "loss": 0.0569, + "step": 645 + }, + { + "epoch": 0.1713796058269066, + "grad_norm": 0.001373291015625, + "learning_rate": 0.00018297928486607734, + "loss": 0.007, + "step": 650 + }, + { + "epoch": 0.17269791048711358, + "grad_norm": 1.0859375, + "learning_rate": 0.00018284734133790738, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.6015625, + "learning_rate": 0.00018271539780973742, + "loss": 0.0601, + "step": 660 + }, + { + "epoch": 0.1753345198075275, + "grad_norm": 0.25390625, + "learning_rate": 0.0001825834542815675, + "loss": 0.0211, + "step": 665 + }, + { + "epoch": 0.1766528244677345, + "grad_norm": 2.6875, + "learning_rate": 0.00018245151075339753, + "loss": 0.0713, + "step": 670 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.1875, + "learning_rate": 0.0001823195672252276, + "loss": 0.0522, + "step": 675 + }, + { + "epoch": 0.17928943378814843, + "grad_norm": 0.025146484375, + "learning_rate": 0.00018218762369705767, + "loss": 0.0242, + "step": 680 + }, + { + "epoch": 0.18060773844835543, + "grad_norm": 0.048095703125, + "learning_rate": 0.00018205568016888772, + "loss": 0.0129, + "step": 685 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018192373664071778, + "loss": 0.0142, + "step": 690 + }, + { + "epoch": 0.18324434776876936, + "grad_norm": 0.00830078125, + "learning_rate": 0.00018179179311254783, + "loss": 0.0121, + "step": 695 + }, + { + "epoch": 0.18456265242897635, + "grad_norm": 0.53125, + "learning_rate": 0.0001816598495843779, + "loss": 0.0163, + "step": 700 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 0.185546875, + "learning_rate": 0.00018152790605620796, + "loss": 0.0203, + "step": 705 + }, + { + "epoch": 0.18719926174939028, + "grad_norm": 1.2578125, + "learning_rate": 0.000181395962528038, + "loss": 0.1548, + "step": 710 + }, + { + "epoch": 0.18851756640959727, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00018126401899986808, + "loss": 0.0543, + "step": 715 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.07568359375, + "learning_rate": 0.00018113207547169812, + "loss": 0.0346, + "step": 720 + }, + { + "epoch": 0.1911541757300112, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001810001319435282, + "loss": 0.03, + "step": 725 + }, + { + "epoch": 0.1924724803902182, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018086818841535823, + "loss": 0.0796, + "step": 730 + }, + { + "epoch": 0.19379078505042516, + "grad_norm": 0.09814453125, + "learning_rate": 0.0001807362448871883, + "loss": 0.0662, + "step": 735 + }, + { + "epoch": 0.19510908971063212, + "grad_norm": 0.91015625, + "learning_rate": 0.00018060430135901837, + "loss": 0.0675, + "step": 740 + }, + { + "epoch": 0.19642739437083911, + "grad_norm": 0.10693359375, + "learning_rate": 0.0001804723578308484, + "loss": 0.0377, + "step": 745 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.95703125, + "learning_rate": 0.00018034041430267848, + "loss": 0.0174, + "step": 750 + }, + { + "epoch": 0.19906400369125304, + "grad_norm": 1.7890625, + "learning_rate": 0.00018020847077450852, + "loss": 0.0278, + "step": 755 + }, + { + "epoch": 0.20038230835146, + "grad_norm": 0.8515625, + "learning_rate": 0.00018007652724633856, + "loss": 0.0113, + "step": 760 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.016845703125, + "learning_rate": 0.00017994458371816863, + "loss": 0.0589, + "step": 765 + }, + { + "epoch": 
0.20301891767187397, + "grad_norm": 0.01043701171875, + "learning_rate": 0.00017981264018999867, + "loss": 0.0203, + "step": 770 + }, + { + "epoch": 0.20433722233208093, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00017968069666182874, + "loss": 0.0494, + "step": 775 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.56640625, + "learning_rate": 0.00017954875313365879, + "loss": 0.0394, + "step": 780 + }, + { + "epoch": 0.2069738316524949, + "grad_norm": 0.06591796875, + "learning_rate": 0.00017941680960548886, + "loss": 0.0848, + "step": 785 + }, + { + "epoch": 0.20829213631270185, + "grad_norm": 0.40234375, + "learning_rate": 0.00017928486607731892, + "loss": 0.0464, + "step": 790 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.06298828125, + "learning_rate": 0.00017915292254914897, + "loss": 0.0222, + "step": 795 + }, + { + "epoch": 0.2109287456331158, + "grad_norm": 0.5390625, + "learning_rate": 0.00017902097902097904, + "loss": 0.0434, + "step": 800 + }, + { + "epoch": 0.21224705029332278, + "grad_norm": 1.390625, + "learning_rate": 0.00017888903549280908, + "loss": 0.0222, + "step": 805 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00017875709196463915, + "loss": 0.0099, + "step": 810 + }, + { + "epoch": 0.21488365961373673, + "grad_norm": 0.10009765625, + "learning_rate": 0.0001786251484364692, + "loss": 0.0086, + "step": 815 + }, + { + "epoch": 0.2162019642739437, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017849320490829926, + "loss": 0.0715, + "step": 820 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.365234375, + "learning_rate": 0.00017836126138012933, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.21883857359435765, + "grad_norm": 0.01519775390625, + "learning_rate": 0.00017822931785195937, + "loss": 0.0111, + "step": 830 + }, + { + "epoch": 0.22015687825456462, + "grad_norm": 1.1640625, + "learning_rate": 0.00017809737432378944, + "loss": 0.0518, + "step": 835 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.00921630859375, + "learning_rate": 0.00017796543079561948, + "loss": 0.0384, + "step": 840 + }, + { + "epoch": 0.22279348757497858, + "grad_norm": 0.33984375, + "learning_rate": 0.00017783348726744955, + "loss": 0.0204, + "step": 845 + }, + { + "epoch": 0.22411179223518554, + "grad_norm": 0.294921875, + "learning_rate": 0.00017770154373927962, + "loss": 0.0075, + "step": 850 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.033203125, + "learning_rate": 0.00017756960021110963, + "loss": 0.0895, + "step": 855 + }, + { + "epoch": 0.2267484015555995, + "grad_norm": 0.08056640625, + "learning_rate": 0.0001774376566829397, + "loss": 0.1039, + "step": 860 + }, + { + "epoch": 0.22806670621580646, + "grad_norm": 0.55078125, + "learning_rate": 0.00017730571315476975, + "loss": 0.0125, + "step": 865 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 0.5859375, + "learning_rate": 0.00017717376962659982, + "loss": 0.0381, + "step": 870 + }, + { + "epoch": 0.23070331553622042, + "grad_norm": 0.029052734375, + "learning_rate": 0.00017704182609842988, + "loss": 0.0434, + "step": 875 + }, + { + "epoch": 0.23202162019642739, + "grad_norm": 0.43359375, + "learning_rate": 0.00017690988257025993, + "loss": 0.0799, + "step": 880 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.04150390625, + "learning_rate": 0.00017677793904209, + "loss": 0.0692, + "step": 885 + }, + { + "epoch": 0.23465822951684134, + "grad_norm": 0.435546875, + "learning_rate": 
0.00017664599551392004, + "loss": 0.0544, + "step": 890 + }, + { + "epoch": 0.2359765341770483, + "grad_norm": 1.171875, + "learning_rate": 0.0001765140519857501, + "loss": 0.0619, + "step": 895 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.01263427734375, + "learning_rate": 0.00017638210845758018, + "loss": 0.0418, + "step": 900 + }, + { + "epoch": 0.23861314349746227, + "grad_norm": 0.017578125, + "learning_rate": 0.00017625016492941022, + "loss": 0.0195, + "step": 905 + }, + { + "epoch": 0.23993144815766923, + "grad_norm": 0.6171875, + "learning_rate": 0.0001761182214012403, + "loss": 0.067, + "step": 910 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.59765625, + "learning_rate": 0.00017598627787307033, + "loss": 0.049, + "step": 915 + }, + { + "epoch": 0.2425680574780832, + "grad_norm": 1.2421875, + "learning_rate": 0.0001758543343449004, + "loss": 0.0539, + "step": 920 + }, + { + "epoch": 0.24388636213829015, + "grad_norm": 0.10302734375, + "learning_rate": 0.00017572239081673044, + "loss": 0.0725, + "step": 925 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.330078125, + "learning_rate": 0.0001755904472885605, + "loss": 0.064, + "step": 930 + }, + { + "epoch": 0.2465229714587041, + "grad_norm": 0.220703125, + "learning_rate": 0.00017545850376039058, + "loss": 0.0271, + "step": 935 + }, + { + "epoch": 0.24784127611891107, + "grad_norm": 0.01470947265625, + "learning_rate": 0.00017532656023222062, + "loss": 0.0247, + "step": 940 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 0.013427734375, + "learning_rate": 0.0001751946167040507, + "loss": 0.017, + "step": 945 + }, + { + "epoch": 0.25047788543932503, + "grad_norm": 0.58984375, + "learning_rate": 0.00017506267317588073, + "loss": 0.0254, + "step": 950 + }, + { + "epoch": 0.251796190099532, + "grad_norm": 0.412109375, + "learning_rate": 0.00017493072964771078, + "loss": 0.0186, + "step": 955 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 0.66796875, + "learning_rate": 0.00017479878611954084, + "loss": 0.0617, + "step": 960 + }, + { + "epoch": 0.25443279941994595, + "grad_norm": 0.322265625, + "learning_rate": 0.00017466684259137089, + "loss": 0.0173, + "step": 965 + }, + { + "epoch": 0.25575110408015295, + "grad_norm": 0.83203125, + "learning_rate": 0.00017453489906320096, + "loss": 0.0512, + "step": 970 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.08447265625, + "learning_rate": 0.000174402955535031, + "loss": 0.0361, + "step": 975 + }, + { + "epoch": 0.2583877134005669, + "grad_norm": 0.423828125, + "learning_rate": 0.00017427101200686107, + "loss": 0.0175, + "step": 980 + }, + { + "epoch": 0.25970601806077387, + "grad_norm": 0.77734375, + "learning_rate": 0.00017413906847869114, + "loss": 0.0139, + "step": 985 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 0.515625, + "learning_rate": 0.00017400712495052118, + "loss": 0.0948, + "step": 990 + }, + { + "epoch": 0.2623426273811878, + "grad_norm": 1.421875, + "learning_rate": 0.00017387518142235125, + "loss": 0.0406, + "step": 995 + }, + { + "epoch": 0.2636609320413948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0001737432378941813, + "loss": 0.1011, + "step": 1000 + }, + { + "epoch": 0.2636609320413948, + "eval_loss": 0.045552924275398254, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.6113, + "eval_samples_per_second": 7.369, + "eval_steps_per_second": 3.684, + "step": 1000 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.380859375, + "learning_rate": 0.00017361129436601136, + 
"loss": 0.0711, + "step": 1005 + }, + { + "epoch": 0.2662975413618087, + "grad_norm": 0.0208740234375, + "learning_rate": 0.00017347935083784143, + "loss": 0.0218, + "step": 1010 + }, + { + "epoch": 0.2676158460220157, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017334740730967147, + "loss": 0.0301, + "step": 1015 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 0.2734375, + "learning_rate": 0.00017321546378150154, + "loss": 0.0721, + "step": 1020 + }, + { + "epoch": 0.27025245534242964, + "grad_norm": 0.25390625, + "learning_rate": 0.00017308352025333158, + "loss": 0.0363, + "step": 1025 + }, + { + "epoch": 0.27157076000263664, + "grad_norm": 0.04345703125, + "learning_rate": 0.00017295157672516165, + "loss": 0.0313, + "step": 1030 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0001728196331969917, + "loss": 0.0385, + "step": 1035 + }, + { + "epoch": 0.27420736932305056, + "grad_norm": 0.00787353515625, + "learning_rate": 0.00017268768966882176, + "loss": 0.0405, + "step": 1040 + }, + { + "epoch": 0.27552567398325756, + "grad_norm": 0.484375, + "learning_rate": 0.00017255574614065183, + "loss": 0.0616, + "step": 1045 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.0908203125, + "learning_rate": 0.00017242380261248185, + "loss": 0.0057, + "step": 1050 + }, + { + "epoch": 0.2781622833036715, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017229185908431192, + "loss": 0.0417, + "step": 1055 + }, + { + "epoch": 0.2794805879638785, + "grad_norm": 0.30078125, + "learning_rate": 0.00017215991555614196, + "loss": 0.0346, + "step": 1060 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.016357421875, + "learning_rate": 0.00017202797202797203, + "loss": 0.0295, + "step": 1065 + }, + { + "epoch": 0.2821171972842924, + "grad_norm": 0.490234375, + "learning_rate": 0.0001718960284998021, + "loss": 0.0448, + "step": 1070 + }, + { + "epoch": 0.28343550194449935, + "grad_norm": 0.004241943359375, + "learning_rate": 0.00017176408497163214, + "loss": 0.0051, + "step": 1075 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.01904296875, + "learning_rate": 0.0001716321414434622, + "loss": 0.0894, + "step": 1080 + }, + { + "epoch": 0.28607211126491333, + "grad_norm": 0.83984375, + "learning_rate": 0.00017150019791529225, + "loss": 0.0288, + "step": 1085 + }, + { + "epoch": 0.28739041592512027, + "grad_norm": 0.2021484375, + "learning_rate": 0.00017136825438712232, + "loss": 0.0222, + "step": 1090 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.322265625, + "learning_rate": 0.0001712363108589524, + "loss": 0.0444, + "step": 1095 + }, + { + "epoch": 0.29002702524553425, + "grad_norm": 0.408203125, + "learning_rate": 0.00017110436733078243, + "loss": 0.0828, + "step": 1100 + }, + { + "epoch": 0.2913453299057412, + "grad_norm": 0.04052734375, + "learning_rate": 0.0001709724238026125, + "loss": 0.0725, + "step": 1105 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 0.2578125, + "learning_rate": 0.00017084048027444254, + "loss": 0.0204, + "step": 1110 + }, + { + "epoch": 0.2939819392261552, + "grad_norm": 0.67578125, + "learning_rate": 0.0001707085367462726, + "loss": 0.0503, + "step": 1115 + }, + { + "epoch": 0.2953002438863621, + "grad_norm": 0.0059814453125, + "learning_rate": 0.00017057659321810265, + "loss": 0.0144, + "step": 1120 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017044464968993272, + "loss": 0.0044, + "step": 1125 + }, + { + "epoch": 
0.2979368532067761, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001703127061617628, + "loss": 0.013, + "step": 1130 + }, + { + "epoch": 0.29925515786698303, + "grad_norm": 0.287109375, + "learning_rate": 0.00017018076263359283, + "loss": 0.0245, + "step": 1135 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.26171875, + "learning_rate": 0.0001700488191054229, + "loss": 0.0247, + "step": 1140 + }, + { + "epoch": 0.301891767187397, + "grad_norm": 0.40625, + "learning_rate": 0.00016991687557725294, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.30321007184760396, + "grad_norm": 1.2578125, + "learning_rate": 0.000169784932049083, + "loss": 0.0071, + "step": 1150 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.330078125, + "learning_rate": 0.00016965298852091306, + "loss": 0.0177, + "step": 1155 + }, + { + "epoch": 0.30584668116801794, + "grad_norm": 0.07275390625, + "learning_rate": 0.0001695210449927431, + "loss": 0.0029, + "step": 1160 + }, + { + "epoch": 0.3071649858282249, + "grad_norm": 0.455078125, + "learning_rate": 0.00016938910146457317, + "loss": 0.0262, + "step": 1165 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.002655029296875, + "learning_rate": 0.0001692571579364032, + "loss": 0.0346, + "step": 1170 + }, + { + "epoch": 0.30980159514863886, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016912521440823328, + "loss": 0.0494, + "step": 1175 + }, + { + "epoch": 0.3111198998088458, + "grad_norm": 1.4609375, + "learning_rate": 0.00016899327088006335, + "loss": 0.0603, + "step": 1180 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001688613273518934, + "loss": 0.0366, + "step": 1185 + }, + { + "epoch": 0.3137565091292598, + "grad_norm": 0.01422119140625, + "learning_rate": 0.00016872938382372346, + "loss": 0.0678, + "step": 1190 + }, + { + "epoch": 0.3150748137894667, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001685974402955535, + "loss": 0.0359, + "step": 1195 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.275390625, + "learning_rate": 0.00016846549676738357, + "loss": 0.1099, + "step": 1200 + }, + { + "epoch": 0.3177114231098807, + "grad_norm": 0.212890625, + "learning_rate": 0.00016833355323921364, + "loss": 0.0343, + "step": 1205 + }, + { + "epoch": 0.31902972777008765, + "grad_norm": 0.0302734375, + "learning_rate": 0.00016820160971104368, + "loss": 0.0138, + "step": 1210 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.016845703125, + "learning_rate": 0.00016806966618287375, + "loss": 0.0202, + "step": 1215 + }, + { + "epoch": 0.32166633709050163, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001679377226547038, + "loss": 0.0442, + "step": 1220 + }, + { + "epoch": 0.32298464175070857, + "grad_norm": 0.049072265625, + "learning_rate": 0.00016780577912653386, + "loss": 0.0375, + "step": 1225 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001676738355983639, + "loss": 0.01, + "step": 1230 + }, + { + "epoch": 0.32562125107112255, + "grad_norm": 0.02197265625, + "learning_rate": 0.00016754189207019397, + "loss": 0.0139, + "step": 1235 + }, + { + "epoch": 0.3269395557313295, + "grad_norm": 0.09228515625, + "learning_rate": 0.00016740994854202404, + "loss": 0.014, + "step": 1240 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.47265625, + "learning_rate": 0.00016727800501385408, + "loss": 0.1546, + "step": 1245 + }, + { + "epoch": 0.3295761650517435, + "grad_norm": 0.02294921875, + "learning_rate": 
0.00016714606148568413, + "loss": 0.0803, + "step": 1250 + }, + { + "epoch": 0.3308944697119504, + "grad_norm": 0.185546875, + "learning_rate": 0.00016701411795751417, + "loss": 0.0376, + "step": 1255 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.1123046875, + "learning_rate": 0.00016688217442934424, + "loss": 0.0375, + "step": 1260 + }, + { + "epoch": 0.3335310790323644, + "grad_norm": 1.03125, + "learning_rate": 0.0001667502309011743, + "loss": 0.0442, + "step": 1265 + }, + { + "epoch": 0.33484938369257133, + "grad_norm": 0.0172119140625, + "learning_rate": 0.00016661828737300435, + "loss": 0.0261, + "step": 1270 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.42578125, + "learning_rate": 0.00016648634384483442, + "loss": 0.0553, + "step": 1275 + }, + { + "epoch": 0.3374859930129853, + "grad_norm": 0.1328125, + "learning_rate": 0.00016635440031666446, + "loss": 0.0065, + "step": 1280 + }, + { + "epoch": 0.33880429767319226, + "grad_norm": 0.263671875, + "learning_rate": 0.00016622245678849453, + "loss": 0.0527, + "step": 1285 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.314453125, + "learning_rate": 0.0001660905132603246, + "loss": 0.0297, + "step": 1290 + }, + { + "epoch": 0.34144090699360624, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016595856973215464, + "loss": 0.0477, + "step": 1295 + }, + { + "epoch": 0.3427592116538132, + "grad_norm": 0.08154296875, + "learning_rate": 0.0001658266262039847, + "loss": 0.0298, + "step": 1300 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 0.08935546875, + "learning_rate": 0.00016569468267581475, + "loss": 0.0481, + "step": 1305 + }, + { + "epoch": 0.34539582097422716, + "grad_norm": 0.06640625, + "learning_rate": 0.00016556273914764482, + "loss": 0.0153, + "step": 1310 + }, + { + "epoch": 0.3467141256344341, + "grad_norm": 0.00592041015625, + "learning_rate": 0.00016543079561947486, + "loss": 0.0111, + "step": 1315 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.2236328125, + "learning_rate": 0.00016529885209130493, + "loss": 0.0309, + "step": 1320 + }, + { + "epoch": 0.3493507349548481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.000165166908563135, + "loss": 0.0579, + "step": 1325 + }, + { + "epoch": 0.350669039615055, + "grad_norm": 0.10107421875, + "learning_rate": 0.00016503496503496504, + "loss": 0.0055, + "step": 1330 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.71875, + "learning_rate": 0.00016490302150679511, + "loss": 0.0299, + "step": 1335 + }, + { + "epoch": 0.353305648935469, + "grad_norm": 0.01348876953125, + "learning_rate": 0.00016477107797862516, + "loss": 0.0943, + "step": 1340 + }, + { + "epoch": 0.35462395359567594, + "grad_norm": 0.3046875, + "learning_rate": 0.00016463913445045523, + "loss": 0.0216, + "step": 1345 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.02392578125, + "learning_rate": 0.00016450719092228527, + "loss": 0.0265, + "step": 1350 + }, + { + "epoch": 0.35726056291608993, + "grad_norm": 0.453125, + "learning_rate": 0.0001643752473941153, + "loss": 0.0539, + "step": 1355 + }, + { + "epoch": 0.35857886757629687, + "grad_norm": 0.00823974609375, + "learning_rate": 0.00016424330386594538, + "loss": 0.0139, + "step": 1360 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.55859375, + "learning_rate": 0.00016411136033777542, + "loss": 0.0428, + "step": 1365 + }, + { + "epoch": 0.36121547689671085, + "grad_norm": 0.052734375, + "learning_rate": 0.0001639794168096055, + "loss": 0.0346, + "step": 1370 + }, + { + "epoch": 
0.3625337815569178, + "grad_norm": 0.12158203125, + "learning_rate": 0.00016384747328143556, + "loss": 0.0095, + "step": 1375 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0001637155297532656, + "loss": 0.0224, + "step": 1380 + }, + { + "epoch": 0.3651703908773318, + "grad_norm": 0.01318359375, + "learning_rate": 0.00016358358622509567, + "loss": 0.0316, + "step": 1385 + }, + { + "epoch": 0.3664886955375387, + "grad_norm": 0.011962890625, + "learning_rate": 0.0001634516426969257, + "loss": 0.0051, + "step": 1390 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.00396728515625, + "learning_rate": 0.00016331969916875578, + "loss": 0.038, + "step": 1395 + }, + { + "epoch": 0.3691253048579527, + "grad_norm": 0.375, + "learning_rate": 0.00016318775564058585, + "loss": 0.029, + "step": 1400 + }, + { + "epoch": 0.37044360951815963, + "grad_norm": 0.265625, + "learning_rate": 0.0001630558121124159, + "loss": 0.0072, + "step": 1405 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.00127410888671875, + "learning_rate": 0.00016292386858424596, + "loss": 0.0381, + "step": 1410 + }, + { + "epoch": 0.3730802188385736, + "grad_norm": 1.15625, + "learning_rate": 0.000162791925056076, + "loss": 0.0573, + "step": 1415 + }, + { + "epoch": 0.37439852349878056, + "grad_norm": 0.0244140625, + "learning_rate": 0.00016265998152790607, + "loss": 0.051, + "step": 1420 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.0015106201171875, + "learning_rate": 0.00016252803799973612, + "loss": 0.0239, + "step": 1425 + }, + { + "epoch": 0.37703513281919454, + "grad_norm": 0.26953125, + "learning_rate": 0.00016239609447156618, + "loss": 0.0165, + "step": 1430 + }, + { + "epoch": 0.3783534374794015, + "grad_norm": 0.006134033203125, + "learning_rate": 0.00016226415094339625, + "loss": 0.0071, + "step": 1435 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 2.828125, + "learning_rate": 0.0001621322074152263, + "loss": 0.0272, + "step": 1440 + }, + { + "epoch": 0.38099004679981546, + "grad_norm": 0.349609375, + "learning_rate": 0.00016200026388705637, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.3823083514600224, + "grad_norm": 0.09326171875, + "learning_rate": 0.00016186832035888638, + "loss": 0.0262, + "step": 1450 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 0.041015625, + "learning_rate": 0.00016173637683071645, + "loss": 0.0576, + "step": 1455 + }, + { + "epoch": 0.3849449607804364, + "grad_norm": 0.033935546875, + "learning_rate": 0.00016160443330254652, + "loss": 0.0142, + "step": 1460 + }, + { + "epoch": 0.3862632654406433, + "grad_norm": 0.09130859375, + "learning_rate": 0.00016147248977437656, + "loss": 0.0348, + "step": 1465 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 2.390625, + "learning_rate": 0.00016134054624620663, + "loss": 0.0672, + "step": 1470 + }, + { + "epoch": 0.3888998747610573, + "grad_norm": 0.439453125, + "learning_rate": 0.00016120860271803667, + "loss": 0.0121, + "step": 1475 + }, + { + "epoch": 0.39021817942126424, + "grad_norm": 0.1298828125, + "learning_rate": 0.00016107665918986674, + "loss": 0.0114, + "step": 1480 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609447156616968, + "loss": 0.0968, + "step": 1485 + }, + { + "epoch": 0.39285478874167823, + "grad_norm": 0.703125, + "learning_rate": 0.00016081277213352685, + "loss": 0.0349, + "step": 1490 + }, + { + "epoch": 0.39417309340188517, + "grad_norm": 0.021728515625, + "learning_rate": 
0.00016068082860535692, + "loss": 0.0106, + "step": 1495 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7265625, + "learning_rate": 0.00016054888507718696, + "loss": 0.0225, + "step": 1500 + }, + { + "epoch": 0.39549139806209216, + "eval_loss": 0.03515048325061798, + "eval_model_preparation_time": 0.0076, + "eval_runtime": 457.3497, + "eval_samples_per_second": 7.373, + "eval_steps_per_second": 3.686, + "step": 1500 + }, + { + "epoch": 0.3968097027222991, + "grad_norm": 0.016519820317626, + "learning_rate": 0.00016041694154901703, + "loss": 0.0202, + "step": 1505 + }, + { + "epoch": 0.3981280073825061, + "grad_norm": 0.8505942225456238, + "learning_rate": 0.00016028499802084708, + "loss": 0.0541, + "step": 1510 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.04163295030593872, + "learning_rate": 0.00016015305449267714, + "loss": 0.0037, + "step": 1515 + }, + { + "epoch": 0.40076461670292, + "grad_norm": 0.011332935653626919, + "learning_rate": 0.00016002111096450721, + "loss": 0.0459, + "step": 1520 + }, + { + "epoch": 0.402082921363127, + "grad_norm": 0.9360129833221436, + "learning_rate": 0.00015988916743633726, + "loss": 0.013, + "step": 1525 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.11991436779499054, + "learning_rate": 0.00015975722390816733, + "loss": 0.0079, + "step": 1530 + }, + { + "epoch": 0.40471953068354094, + "grad_norm": 0.36911076307296753, + "learning_rate": 0.00015962528037999737, + "loss": 0.0638, + "step": 1535 + }, + { + "epoch": 0.40603783534374793, + "grad_norm": 0.020278634503483772, + "learning_rate": 0.00015949333685182744, + "loss": 0.0217, + "step": 1540 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.14263059198856354, + "learning_rate": 0.0001593613933236575, + "loss": 0.0495, + "step": 1545 + }, + { + "epoch": 0.40867444466416186, + "grad_norm": 0.09494803845882416, + "learning_rate": 0.00015922944979548752, + "loss": 0.0248, + "step": 1550 + }, + { + "epoch": 0.40999274932436885, + "grad_norm": 0.23064319789409637, + "learning_rate": 0.0001590975062673176, + "loss": 0.0285, + "step": 1555 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.32220256328582764, + "learning_rate": 0.00015896556273914763, + "loss": 0.0537, + "step": 1560 + }, + { + "epoch": 0.4126293586447828, + "grad_norm": 0.41208815574645996, + "learning_rate": 0.0001588336192109777, + "loss": 0.0453, + "step": 1565 + }, + { + "epoch": 0.4139476633049898, + "grad_norm": 0.03775424137711525, + "learning_rate": 0.00015870167568280777, + "loss": 0.0134, + "step": 1570 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 0.6526333093643188, + "learning_rate": 0.0001585697321546378, + "loss": 0.0329, + "step": 1575 + }, + { + "epoch": 0.4165842726254037, + "grad_norm": 1.001305103302002, + "learning_rate": 0.00015843778862646788, + "loss": 0.0912, + "step": 1580 + }, + { + "epoch": 0.4179025772856107, + "grad_norm": 0.4055219888687134, + "learning_rate": 0.00015830584509829792, + "loss": 0.0519, + "step": 1585 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.035015616565942764, + "learning_rate": 0.000158173901570128, + "loss": 0.0191, + "step": 1590 + }, + { + "epoch": 0.42053918660602463, + "grad_norm": 0.09326844662427902, + "learning_rate": 0.00015804195804195806, + "loss": 0.0106, + "step": 1595 + }, + { + "epoch": 0.4218574912662316, + "grad_norm": 0.06223440542817116, + "learning_rate": 0.0001579100145137881, + "loss": 0.0113, + "step": 1600 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.0625135526061058, + "learning_rate": 
0.00015777807098561817, + "loss": 0.0191, + "step": 1605 + }, + { + "epoch": 0.42449410058664555, + "grad_norm": 0.2645983099937439, + "learning_rate": 0.00015764612745744822, + "loss": 0.0829, + "step": 1610 + }, + { + "epoch": 0.42581240524685254, + "grad_norm": 0.009632415138185024, + "learning_rate": 0.00015751418392927829, + "loss": 0.0542, + "step": 1615 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.01979319378733635, + "learning_rate": 0.00015738224040110833, + "loss": 0.0517, + "step": 1620 + }, + { + "epoch": 0.4284490145672665, + "grad_norm": 0.3065454065799713, + "learning_rate": 0.0001572502968729384, + "loss": 0.0738, + "step": 1625 + }, + { + "epoch": 0.42976731922747347, + "grad_norm": 0.09581473469734192, + "learning_rate": 0.00015711835334476847, + "loss": 0.0571, + "step": 1630 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.23746591806411743, + "learning_rate": 0.0001569864098165985, + "loss": 0.0128, + "step": 1635 + }, + { + "epoch": 0.4324039285478874, + "grad_norm": 0.936278760433197, + "learning_rate": 0.00015685446628842858, + "loss": 0.0665, + "step": 1640 + }, + { + "epoch": 0.4337222332080944, + "grad_norm": 0.18487441539764404, + "learning_rate": 0.00015672252276025862, + "loss": 0.0527, + "step": 1645 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.6980624794960022, + "learning_rate": 0.00015659057923208866, + "loss": 0.0613, + "step": 1650 + }, + { + "epoch": 0.4363588425285083, + "grad_norm": 0.4696301221847534, + "learning_rate": 0.00015645863570391873, + "loss": 0.0569, + "step": 1655 + }, + { + "epoch": 0.4376771471887153, + "grad_norm": 0.15083105862140656, + "learning_rate": 0.00015632669217574877, + "loss": 0.0394, + "step": 1660 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.44701239466667175, + "learning_rate": 0.00015619474864757884, + "loss": 0.0494, + "step": 1665 + }, + { + "epoch": 0.44031375650912924, + "grad_norm": 0.07418403029441833, + "learning_rate": 0.00015606280511940888, + "loss": 0.0291, + "step": 1670 + }, + { + "epoch": 0.44163206116933623, + "grad_norm": 0.02311861515045166, + "learning_rate": 0.00015593086159123895, + "loss": 0.0304, + "step": 1675 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.4416038990020752, + "learning_rate": 0.00015579891806306902, + "loss": 0.0176, + "step": 1680 + }, + { + "epoch": 0.44426867048975016, + "grad_norm": 0.5124915242195129, + "learning_rate": 0.00015566697453489906, + "loss": 0.0454, + "step": 1685 + }, + { + "epoch": 0.44558697514995715, + "grad_norm": 0.3159286081790924, + "learning_rate": 0.00015553503100672913, + "loss": 0.047, + "step": 1690 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.032126396894454956, + "learning_rate": 0.00015540308747855918, + "loss": 0.0151, + "step": 1695 + }, + { + "epoch": 0.4482235844703711, + "grad_norm": 0.04663548618555069, + "learning_rate": 0.00015527114395038924, + "loss": 0.0375, + "step": 1700 + }, + { + "epoch": 0.4495418891305781, + "grad_norm": 0.013753900304436684, + "learning_rate": 0.0001551392004222193, + "loss": 0.0485, + "step": 1705 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 1.9952393770217896, + "learning_rate": 0.00015500725689404936, + "loss": 0.0625, + "step": 1710 + }, + { + "epoch": 0.452178498450992, + "grad_norm": 0.014283270575106144, + "learning_rate": 0.00015487531336587943, + "loss": 0.0037, + "step": 1715 + }, + { + "epoch": 0.453496803111199, + "grad_norm": 0.3897913098335266, + "learning_rate": 0.00015474336983770947, + "loss": 0.0304, + "step": 1720 + 
}, + { + "epoch": 0.454815107771406, + "grad_norm": 0.3730885684490204, + "learning_rate": 0.00015461142630953954, + "loss": 0.0115, + "step": 1725 + }, + { + "epoch": 0.45613341243161293, + "grad_norm": 0.035858724266290665, + "learning_rate": 0.00015447948278136958, + "loss": 0.0021, + "step": 1730 + }, + { + "epoch": 0.4574517170918199, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.00015434753925319965, + "loss": 0.0132, + "step": 1735 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.004939342383295298, + "learning_rate": 0.00015421559572502972, + "loss": 0.0471, + "step": 1740 + }, + { + "epoch": 0.46008832641223385, + "grad_norm": 0.03493283689022064, + "learning_rate": 0.00015408365219685976, + "loss": 0.0062, + "step": 1745 + }, + { + "epoch": 0.46140663107244084, + "grad_norm": 0.045927103608846664, + "learning_rate": 0.0001539517086686898, + "loss": 0.0283, + "step": 1750 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.012629454955458641, + "learning_rate": 0.00015381976514051984, + "loss": 0.0133, + "step": 1755 + }, + { + "epoch": 0.46404324039285477, + "grad_norm": 0.8001697659492493, + "learning_rate": 0.0001536878216123499, + "loss": 0.0224, + "step": 1760 + }, + { + "epoch": 0.46536154505306176, + "grad_norm": 0.002036362886428833, + "learning_rate": 0.00015355587808417998, + "loss": 0.0066, + "step": 1765 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 1.0261330604553223, + "learning_rate": 0.00015342393455601002, + "loss": 0.191, + "step": 1770 + }, + { + "epoch": 0.4679981543734757, + "grad_norm": 0.3033429682254791, + "learning_rate": 0.0001532919910278401, + "loss": 0.0222, + "step": 1775 + }, + { + "epoch": 0.4693164590336827, + "grad_norm": 0.36911338567733765, + "learning_rate": 0.00015316004749967014, + "loss": 0.0363, + "step": 1780 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.0406811460852623, + "learning_rate": 0.0001530281039715002, + "loss": 0.0283, + "step": 1785 + }, + { + "epoch": 0.4719530683540966, + "grad_norm": 0.23334211111068726, + "learning_rate": 0.00015289616044333027, + "loss": 0.0274, + "step": 1790 + }, + { + "epoch": 0.4732713730143036, + "grad_norm": 0.013081169687211514, + "learning_rate": 0.00015276421691516032, + "loss": 0.0221, + "step": 1795 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.2480790615081787, + "learning_rate": 0.00015263227338699039, + "loss": 0.019, + "step": 1800 + }, + { + "epoch": 0.47590798233471754, + "grad_norm": 0.0373196005821228, + "learning_rate": 0.00015250032985882043, + "loss": 0.0292, + "step": 1805 + }, + { + "epoch": 0.47722628699492453, + "grad_norm": 0.004609994124621153, + "learning_rate": 0.0001523683863306505, + "loss": 0.0918, + "step": 1810 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.02370987832546234, + "learning_rate": 0.00015223644280248054, + "loss": 0.0462, + "step": 1815 + }, + { + "epoch": 0.47986289631533846, + "grad_norm": 0.05842221528291702, + "learning_rate": 0.0001521044992743106, + "loss": 0.0595, + "step": 1820 + }, + { + "epoch": 0.48118120097554545, + "grad_norm": 0.009685276076197624, + "learning_rate": 0.00015197255574614068, + "loss": 0.0074, + "step": 1825 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.8933250308036804, + "learning_rate": 0.00015184061221797072, + "loss": 0.0757, + "step": 1830 + }, + { + "epoch": 0.4838178102959594, + "grad_norm": 0.07075401395559311, + "learning_rate": 0.0001517086686898008, + "loss": 0.0226, + "step": 1835 + }, + { + "epoch": 0.4851361149561664, + "grad_norm": 
0.732706606388092, + "learning_rate": 0.00015157672516163083, + "loss": 0.0161, + "step": 1840 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 1.1897023916244507, + "learning_rate": 0.0001514447816334609, + "loss": 0.0265, + "step": 1845 + }, + { + "epoch": 0.4877727242765803, + "grad_norm": 0.052572328597307205, + "learning_rate": 0.00015131283810529094, + "loss": 0.0094, + "step": 1850 + }, + { + "epoch": 0.4890910289367873, + "grad_norm": 0.08263898640871048, + "learning_rate": 0.00015118089457712098, + "loss": 0.0631, + "step": 1855 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.03225664421916008, + "learning_rate": 0.00015104895104895105, + "loss": 0.023, + "step": 1860 + }, + { + "epoch": 0.4917276382572012, + "grad_norm": 0.007935039699077606, + "learning_rate": 0.0001509170075207811, + "loss": 0.0039, + "step": 1865 + }, + { + "epoch": 0.4930459429174082, + "grad_norm": 0.00830796267837286, + "learning_rate": 0.00015078506399261116, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.08042234182357788, + "learning_rate": 0.00015065312046444123, + "loss": 0.0366, + "step": 1875 + }, + { + "epoch": 0.49568255223782215, + "grad_norm": 0.009092851541936398, + "learning_rate": 0.00015052117693627128, + "loss": 0.0107, + "step": 1880 + }, + { + "epoch": 0.49700085689802914, + "grad_norm": 0.2674141824245453, + "learning_rate": 0.00015038923340810135, + "loss": 0.0076, + "step": 1885 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.07694366574287415, + "learning_rate": 0.0001502572898799314, + "loss": 0.0252, + "step": 1890 + }, + { + "epoch": 0.49963746621844307, + "grad_norm": 0.5699467062950134, + "learning_rate": 0.00015012534635176146, + "loss": 0.0487, + "step": 1895 + }, + { + "epoch": 0.5009557708786501, + "grad_norm": 0.18800878524780273, + "learning_rate": 0.0001499934028235915, + "loss": 0.0183, + "step": 1900 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.019469989463686943, + "learning_rate": 0.00014986145929542157, + "loss": 0.0268, + "step": 1905 + }, + { + "epoch": 0.503592380199064, + "grad_norm": 0.01890506222844124, + "learning_rate": 0.00014972951576725164, + "loss": 0.0449, + "step": 1910 + }, + { + "epoch": 0.5049106848592709, + "grad_norm": 0.0006314461352303624, + "learning_rate": 0.00014959757223908168, + "loss": 0.0056, + "step": 1915 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.32654041051864624, + "learning_rate": 0.00014946562871091175, + "loss": 0.0256, + "step": 1920 + }, + { + "epoch": 0.5075472941796849, + "grad_norm": 0.7803483605384827, + "learning_rate": 0.0001493336851827418, + "loss": 0.0374, + "step": 1925 + }, + { + "epoch": 0.5088655988398919, + "grad_norm": 0.028441445901989937, + "learning_rate": 0.00014920174165457186, + "loss": 0.0161, + "step": 1930 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.028379200026392937, + "learning_rate": 0.00014906979812640193, + "loss": 0.0151, + "step": 1935 + }, + { + "epoch": 0.5115022081603059, + "grad_norm": 0.021159596741199493, + "learning_rate": 0.00014893785459823197, + "loss": 0.0303, + "step": 1940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.24903325736522675, + "learning_rate": 0.000148805911070062, + "loss": 0.0076, + "step": 1945 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.007065301761031151, + "learning_rate": 0.00014867396754189206, + "loss": 0.022, + "step": 1950 + }, + { + "epoch": 0.5154571221409268, + "grad_norm": 0.004032329190522432, + "learning_rate": 
0.00014854202401372212, + "loss": 0.0083, + "step": 1955 + }, + { + "epoch": 0.5167754268011338, + "grad_norm": 0.3045775592327118, + "learning_rate": 0.0001484100804855522, + "loss": 0.0113, + "step": 1960 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.36974939703941345, + "learning_rate": 0.00014827813695738224, + "loss": 0.0267, + "step": 1965 + }, + { + "epoch": 0.5194120361215477, + "grad_norm": 0.009729950688779354, + "learning_rate": 0.0001481461934292123, + "loss": 0.027, + "step": 1970 + }, + { + "epoch": 0.5207303407817546, + "grad_norm": 0.0013097926275804639, + "learning_rate": 0.00014801424990104235, + "loss": 0.003, + "step": 1975 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.0706263929605484, + "learning_rate": 0.00014788230637287242, + "loss": 0.0193, + "step": 1980 + }, + { + "epoch": 0.5233669501021686, + "grad_norm": 1.435702919960022, + "learning_rate": 0.00014775036284470249, + "loss": 0.0647, + "step": 1985 + }, + { + "epoch": 0.5246852547623756, + "grad_norm": 0.00661757867783308, + "learning_rate": 0.00014761841931653253, + "loss": 0.0373, + "step": 1990 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.12014541029930115, + "learning_rate": 0.0001474864757883626, + "loss": 0.0178, + "step": 1995 + }, + { + "epoch": 0.5273218640827896, + "grad_norm": 1.0549248456954956, + "learning_rate": 0.00014735453226019264, + "loss": 0.0191, + "step": 2000 + }, + { + "epoch": 0.5273218640827896, + "eval_loss": 0.037292081862688065, + "eval_runtime": 454.3033, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 3.711, + "step": 2000 + }, + { + "epoch": 0.5286401687429965, + "grad_norm": 0.47634151577949524, + "learning_rate": 0.0001472225887320227, + "loss": 0.0404, + "step": 2005 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 0.006752463988959789, + "learning_rate": 0.00014709064520385275, + "loss": 0.034, + "step": 2010 + }, + { + "epoch": 0.5312767780634104, + "grad_norm": 0.20780125260353088, + "learning_rate": 0.00014695870167568282, + "loss": 0.0421, + "step": 2015 + }, + { + "epoch": 0.5325950827236174, + "grad_norm": 0.010941066779196262, + "learning_rate": 0.0001468267581475129, + "loss": 0.0086, + "step": 2020 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 0.3439581096172333, + "learning_rate": 0.00014669481461934293, + "loss": 0.0187, + "step": 2025 + }, + { + "epoch": 0.5352316920440314, + "grad_norm": 0.14961636066436768, + "learning_rate": 0.000146562871091173, + "loss": 0.0504, + "step": 2030 + }, + { + "epoch": 0.5365499967042383, + "grad_norm": 0.0044641937129199505, + "learning_rate": 0.00014643092756300304, + "loss": 0.0134, + "step": 2035 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.14088386297225952, + "learning_rate": 0.0001462989840348331, + "loss": 0.0096, + "step": 2040 + }, + { + "epoch": 0.5391866060246523, + "grad_norm": 0.48116979002952576, + "learning_rate": 0.00014616704050666315, + "loss": 0.0124, + "step": 2045 + }, + { + "epoch": 0.5405049106848593, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0001460350969784932, + "loss": 0.0226, + "step": 2050 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.002938181860372424, + "learning_rate": 0.00014590315345032326, + "loss": 0.0267, + "step": 2055 + }, + { + "epoch": 0.5431415200052733, + "grad_norm": 0.3335214853286743, + "learning_rate": 0.0001457712099221533, + "loss": 0.0367, + "step": 2060 + }, + { + "epoch": 0.5444598246654802, + "grad_norm": 0.004644686821848154, + "learning_rate": 0.00014563926639398338, + 
"loss": 0.0121, + "step": 2065 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.19505545496940613, + "learning_rate": 0.00014550732286581345, + "loss": 0.0591, + "step": 2070 + }, + { + "epoch": 0.5470964339858941, + "grad_norm": 0.018028756603598595, + "learning_rate": 0.0001453753793376435, + "loss": 0.0131, + "step": 2075 + }, + { + "epoch": 0.5484147386461011, + "grad_norm": 0.045639291405677795, + "learning_rate": 0.00014524343580947356, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.727981686592102, + "learning_rate": 0.0001451114922813036, + "loss": 0.0205, + "step": 2085 + }, + { + "epoch": 0.5510513479665151, + "grad_norm": 0.03766491636633873, + "learning_rate": 0.00014497954875313367, + "loss": 0.0067, + "step": 2090 + }, + { + "epoch": 0.552369652626722, + "grad_norm": 0.1911504715681076, + "learning_rate": 0.0001448476052249637, + "loss": 0.0397, + "step": 2095 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.08238353580236435, + "learning_rate": 0.00014471566169679378, + "loss": 0.0513, + "step": 2100 + }, + { + "epoch": 0.555006261947136, + "grad_norm": 0.06317206472158432, + "learning_rate": 0.00014458371816862385, + "loss": 0.0178, + "step": 2105 + }, + { + "epoch": 0.556324566607343, + "grad_norm": 0.0652734637260437, + "learning_rate": 0.0001444517746404539, + "loss": 0.0184, + "step": 2110 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.05471858009696007, + "learning_rate": 0.00014431983111228396, + "loss": 0.0089, + "step": 2115 + }, + { + "epoch": 0.558961175927757, + "grad_norm": 0.005062670446932316, + "learning_rate": 0.000144187887584114, + "loss": 0.0052, + "step": 2120 + }, + { + "epoch": 0.5602794805879638, + "grad_norm": 0.06337414681911469, + "learning_rate": 0.00014405594405594407, + "loss": 0.053, + "step": 2125 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.33745357394218445, + "learning_rate": 0.00014392400052777414, + "loss": 0.0166, + "step": 2130 + }, + { + "epoch": 0.5629160899083778, + "grad_norm": 0.7382741570472717, + "learning_rate": 0.00014379205699960418, + "loss": 0.0191, + "step": 2135 + }, + { + "epoch": 0.5642343945685848, + "grad_norm": 0.007551972754299641, + "learning_rate": 0.00014366011347143425, + "loss": 0.0022, + "step": 2140 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.6260896921157837, + "learning_rate": 0.00014352816994326427, + "loss": 0.0095, + "step": 2145 + }, + { + "epoch": 0.5668710038889987, + "grad_norm": 0.11619322001934052, + "learning_rate": 0.00014339622641509434, + "loss": 0.015, + "step": 2150 + }, + { + "epoch": 0.5681893085492057, + "grad_norm": 1.1440670490264893, + "learning_rate": 0.0001432642828869244, + "loss": 0.1343, + "step": 2155 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 1.1793878078460693, + "learning_rate": 0.00014313233935875445, + "loss": 0.0968, + "step": 2160 + }, + { + "epoch": 0.5708259178696197, + "grad_norm": 0.6865736842155457, + "learning_rate": 0.00014300039583058452, + "loss": 0.0195, + "step": 2165 + }, + { + "epoch": 0.5721442225298267, + "grad_norm": 0.140816792845726, + "learning_rate": 0.00014286845230241456, + "loss": 0.0761, + "step": 2170 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.04071786254644394, + "learning_rate": 0.00014273650877424463, + "loss": 0.0193, + "step": 2175 + }, + { + "epoch": 0.5747808318502405, + "grad_norm": 0.044617727398872375, + "learning_rate": 0.0001426045652460747, + "loss": 0.0112, + "step": 2180 + }, + { + "epoch": 0.5760991365104475, + 
"grad_norm": 0.11001799255609512, + "learning_rate": 0.00014247262171790474, + "loss": 0.0039, + "step": 2185 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.0036315324250608683, + "learning_rate": 0.0001423406781897348, + "loss": 0.0038, + "step": 2190 + }, + { + "epoch": 0.5787357458308615, + "grad_norm": 0.9866570830345154, + "learning_rate": 0.00014220873466156485, + "loss": 0.025, + "step": 2195 + }, + { + "epoch": 0.5800540504910685, + "grad_norm": 0.023570384830236435, + "learning_rate": 0.00014207679113339492, + "loss": 0.0468, + "step": 2200 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.20010559260845184, + "learning_rate": 0.00014194484760522496, + "loss": 0.0198, + "step": 2205 + }, + { + "epoch": 0.5826906598114824, + "grad_norm": 0.06153270602226257, + "learning_rate": 0.00014181290407705503, + "loss": 0.0764, + "step": 2210 + }, + { + "epoch": 0.5840089644716894, + "grad_norm": 0.033162448555231094, + "learning_rate": 0.0001416809605488851, + "loss": 0.028, + "step": 2215 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.428382933139801, + "learning_rate": 0.00014154901702071514, + "loss": 0.0652, + "step": 2220 + }, + { + "epoch": 0.5866455737921034, + "grad_norm": 0.25004762411117554, + "learning_rate": 0.0001414170734925452, + "loss": 0.0411, + "step": 2225 + }, + { + "epoch": 0.5879638784523104, + "grad_norm": 0.22649863362312317, + "learning_rate": 0.00014128512996437525, + "loss": 0.0517, + "step": 2230 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.035932112485170364, + "learning_rate": 0.00014115318643620532, + "loss": 0.015, + "step": 2235 + }, + { + "epoch": 0.5906004877727242, + "grad_norm": 0.3800172507762909, + "learning_rate": 0.00014102124290803536, + "loss": 0.0324, + "step": 2240 + }, + { + "epoch": 0.5919187924329312, + "grad_norm": 0.6974118947982788, + "learning_rate": 0.0001408892993798654, + "loss": 0.0216, + "step": 2245 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.15472032129764557, + "learning_rate": 0.00014075735585169548, + "loss": 0.0164, + "step": 2250 + }, + { + "epoch": 0.5945554017533452, + "grad_norm": 0.015000814571976662, + "learning_rate": 0.00014062541232352552, + "loss": 0.0395, + "step": 2255 + }, + { + "epoch": 0.5958737064135522, + "grad_norm": 0.052086081355810165, + "learning_rate": 0.0001404934687953556, + "loss": 0.0032, + "step": 2260 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.004600350745022297, + "learning_rate": 0.00014036152526718566, + "loss": 0.0056, + "step": 2265 + }, + { + "epoch": 0.5985103157339661, + "grad_norm": 0.4940958321094513, + "learning_rate": 0.0001402295817390157, + "loss": 0.0206, + "step": 2270 + }, + { + "epoch": 0.5998286203941731, + "grad_norm": 0.09658394008874893, + "learning_rate": 0.00014009763821084577, + "loss": 0.0052, + "step": 2275 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.00020539117394946516, + "learning_rate": 0.0001399656946826758, + "loss": 0.087, + "step": 2280 + }, + { + "epoch": 0.602465229714587, + "grad_norm": 0.1871018409729004, + "learning_rate": 0.00013983375115450588, + "loss": 0.0812, + "step": 2285 + }, + { + "epoch": 0.603783534374794, + "grad_norm": 0.02583954855799675, + "learning_rate": 0.00013970180762633592, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 1.2103784084320068, + "learning_rate": 0.000139569864098166, + "loss": 0.0151, + "step": 2295 + }, + { + "epoch": 0.6064201436952079, + "grad_norm": 0.023514943197369576, + "learning_rate": 
0.00013943792056999606, + "loss": 0.0193, + "step": 2300 + }, + { + "epoch": 0.6077384483554149, + "grad_norm": 0.0076395305804908276, + "learning_rate": 0.0001393059770418261, + "loss": 0.0379, + "step": 2305 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.12412039190530777, + "learning_rate": 0.00013917403351365617, + "loss": 0.0095, + "step": 2310 + }, + { + "epoch": 0.6103750576758289, + "grad_norm": 0.021904783323407173, + "learning_rate": 0.0001390420899854862, + "loss": 0.0166, + "step": 2315 + }, + { + "epoch": 0.6116933623360359, + "grad_norm": 0.004012851510196924, + "learning_rate": 0.00013891014645731628, + "loss": 0.0103, + "step": 2320 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.007267913781106472, + "learning_rate": 0.00013877820292914635, + "loss": 0.0708, + "step": 2325 + }, + { + "epoch": 0.6143299716564498, + "grad_norm": 0.10363642126321793, + "learning_rate": 0.0001386462594009764, + "loss": 0.0473, + "step": 2330 + }, + { + "epoch": 0.6156482763166568, + "grad_norm": 0.04899830371141434, + "learning_rate": 0.00013851431587280646, + "loss": 0.0283, + "step": 2335 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.39460498094558716, + "learning_rate": 0.0001383823723446365, + "loss": 0.0597, + "step": 2340 + }, + { + "epoch": 0.6182848856370707, + "grad_norm": 0.04092290997505188, + "learning_rate": 0.00013825042881646655, + "loss": 0.0167, + "step": 2345 + }, + { + "epoch": 0.6196031902972777, + "grad_norm": 0.2781132161617279, + "learning_rate": 0.00013811848528829662, + "loss": 0.0097, + "step": 2350 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.041443537920713425, + "learning_rate": 0.00013798654176012666, + "loss": 0.0226, + "step": 2355 + }, + { + "epoch": 0.6222397996176916, + "grad_norm": 0.1242462694644928, + "learning_rate": 0.00013785459823195673, + "loss": 0.0055, + "step": 2360 + }, + { + "epoch": 0.6235581042778986, + "grad_norm": 0.4440467357635498, + "learning_rate": 0.00013772265470378677, + "loss": 0.049, + "step": 2365 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.014354427345097065, + "learning_rate": 0.00013759071117561684, + "loss": 0.0327, + "step": 2370 + }, + { + "epoch": 0.6261947135983126, + "grad_norm": 0.011539973318576813, + "learning_rate": 0.0001374587676474469, + "loss": 0.0222, + "step": 2375 + }, + { + "epoch": 0.6275130182585196, + "grad_norm": 0.23539051413536072, + "learning_rate": 0.00013732682411927695, + "loss": 0.0816, + "step": 2380 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.26793941855430603, + "learning_rate": 0.00013719488059110702, + "loss": 0.0325, + "step": 2385 + }, + { + "epoch": 0.6301496275789334, + "grad_norm": 0.01662217453122139, + "learning_rate": 0.00013706293706293706, + "loss": 0.0221, + "step": 2390 + }, + { + "epoch": 0.6314679322391404, + "grad_norm": 0.30669671297073364, + "learning_rate": 0.00013693099353476713, + "loss": 0.026, + "step": 2395 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.03350894898176193, + "learning_rate": 0.00013679905000659717, + "loss": 0.0072, + "step": 2400 + }, + { + "epoch": 0.6341045415595544, + "grad_norm": 0.014983875676989555, + "learning_rate": 0.00013666710647842724, + "loss": 0.049, + "step": 2405 + }, + { + "epoch": 0.6354228462197614, + "grad_norm": 1.8989384174346924, + "learning_rate": 0.0001365351629502573, + "loss": 0.0335, + "step": 2410 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.030135562643408775, + "learning_rate": 0.00013640321942208735, + "loss": 0.0051, + "step": 2415 + }, 
+ { + "epoch": 0.6380594555401753, + "grad_norm": 0.02079075388610363, + "learning_rate": 0.00013627127589391742, + "loss": 0.0138, + "step": 2420 + }, + { + "epoch": 0.6393777602003823, + "grad_norm": 0.06065403297543526, + "learning_rate": 0.00013613933236574746, + "loss": 0.0357, + "step": 2425 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.2980937659740448, + "learning_rate": 0.00013600738883757753, + "loss": 0.0138, + "step": 2430 + }, + { + "epoch": 0.6420143695207963, + "grad_norm": 0.4820438623428345, + "learning_rate": 0.00013587544530940758, + "loss": 0.01, + "step": 2435 + }, + { + "epoch": 0.6433326741810033, + "grad_norm": 0.005618259310722351, + "learning_rate": 0.00013574350178123765, + "loss": 0.0052, + "step": 2440 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.7173821926116943, + "learning_rate": 0.0001356115582530677, + "loss": 0.0133, + "step": 2445 + }, + { + "epoch": 0.6459692835014171, + "grad_norm": 0.0053142281249165535, + "learning_rate": 0.00013547961472489773, + "loss": 0.0045, + "step": 2450 + }, + { + "epoch": 0.6472875881616241, + "grad_norm": 0.06118829548358917, + "learning_rate": 0.0001353476711967278, + "loss": 0.056, + "step": 2455 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 3.5878078937530518, + "learning_rate": 0.00013521572766855787, + "loss": 0.0232, + "step": 2460 + }, + { + "epoch": 0.6499241974820381, + "grad_norm": 0.004911276511847973, + "learning_rate": 0.0001350837841403879, + "loss": 0.0074, + "step": 2465 + }, + { + "epoch": 0.6512425021422451, + "grad_norm": 0.0028026222717016935, + "learning_rate": 0.00013495184061221798, + "loss": 0.0782, + "step": 2470 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.7317615747451782, + "learning_rate": 0.00013481989708404802, + "loss": 0.0222, + "step": 2475 + }, + { + "epoch": 0.653879111462659, + "grad_norm": 0.01835751160979271, + "learning_rate": 0.0001346879535558781, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.655197416122866, + "grad_norm": 0.03598962351679802, + "learning_rate": 0.00013455601002770813, + "loss": 0.0395, + "step": 2485 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.013886351138353348, + "learning_rate": 0.0001344240664995382, + "loss": 0.0156, + "step": 2490 + }, + { + "epoch": 0.65783402544328, + "grad_norm": 5.741530895233154, + "learning_rate": 0.00013429212297136827, + "loss": 0.0317, + "step": 2495 + }, + { + "epoch": 0.659152330103487, + "grad_norm": 0.20793496072292328, + "learning_rate": 0.0001341601794431983, + "loss": 0.0072, + "step": 2500 + }, + { + "epoch": 0.659152330103487, + "eval_loss": 0.0300898440182209, + "eval_runtime": 453.0554, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 3.721, + "step": 2500 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.03460961952805519, + "learning_rate": 0.00013402823591502838, + "loss": 0.0097, + "step": 2505 + }, + { + "epoch": 0.6617889394239008, + "grad_norm": 0.31785696744918823, + "learning_rate": 0.00013389629238685842, + "loss": 0.0303, + "step": 2510 + }, + { + "epoch": 0.6631072440841078, + "grad_norm": 0.4273851215839386, + "learning_rate": 0.0001337643488586885, + "loss": 0.0499, + "step": 2515 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.02236153744161129, + "learning_rate": 0.00013363240533051856, + "loss": 0.0069, + "step": 2520 + }, + { + "epoch": 0.6657438534045218, + "grad_norm": 0.1592864990234375, + "learning_rate": 0.0001335004618023486, + "loss": 0.0326, + "step": 2525 + }, + { + "epoch": 0.6670621580647288, + 
"grad_norm": 0.029961545020341873, + "learning_rate": 0.00013336851827417867, + "loss": 0.0178, + "step": 2530 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.03120764158666134, + "learning_rate": 0.00013323657474600872, + "loss": 0.115, + "step": 2535 + }, + { + "epoch": 0.6696987673851427, + "grad_norm": 0.01060028001666069, + "learning_rate": 0.00013310463121783879, + "loss": 0.0036, + "step": 2540 + }, + { + "epoch": 0.6710170720453497, + "grad_norm": 0.053470809012651443, + "learning_rate": 0.00013297268768966883, + "loss": 0.0079, + "step": 2545 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.022777097299695015, + "learning_rate": 0.00013284074416149887, + "loss": 0.0078, + "step": 2550 + }, + { + "epoch": 0.6736536813657636, + "grad_norm": 0.0548521913588047, + "learning_rate": 0.00013270880063332894, + "loss": 0.0503, + "step": 2555 + }, + { + "epoch": 0.6749719860259706, + "grad_norm": 0.02028457075357437, + "learning_rate": 0.00013257685710515898, + "loss": 0.0096, + "step": 2560 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.01569107361137867, + "learning_rate": 0.00013244491357698905, + "loss": 0.008, + "step": 2565 + }, + { + "epoch": 0.6776085953463845, + "grad_norm": 0.00743742985650897, + "learning_rate": 0.00013231297004881912, + "loss": 0.005, + "step": 2570 + }, + { + "epoch": 0.6789269000065915, + "grad_norm": 0.025164416059851646, + "learning_rate": 0.00013218102652064916, + "loss": 0.018, + "step": 2575 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.3653188645839691, + "learning_rate": 0.00013204908299247923, + "loss": 0.0295, + "step": 2580 + }, + { + "epoch": 0.6815635093270055, + "grad_norm": 0.685422956943512, + "learning_rate": 0.00013191713946430927, + "loss": 0.0335, + "step": 2585 + }, + { + "epoch": 0.6828818139872125, + "grad_norm": 0.675740122795105, + "learning_rate": 0.00013178519593613934, + "loss": 0.0592, + "step": 2590 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.10513252764940262, + "learning_rate": 0.00013165325240796938, + "loss": 0.0353, + "step": 2595 + }, + { + "epoch": 0.6855184233076264, + "grad_norm": 0.43512973189353943, + "learning_rate": 0.00013152130887979945, + "loss": 0.0142, + "step": 2600 + }, + { + "epoch": 0.6868367279678333, + "grad_norm": 0.029436839744448662, + "learning_rate": 0.00013138936535162952, + "loss": 0.0042, + "step": 2605 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.5607122778892517, + "learning_rate": 0.00013125742182345957, + "loss": 0.0184, + "step": 2610 + }, + { + "epoch": 0.6894733372882473, + "grad_norm": 0.11365406215190887, + "learning_rate": 0.00013112547829528963, + "loss": 0.006, + "step": 2615 + }, + { + "epoch": 0.6907916419484543, + "grad_norm": 0.047227244824171066, + "learning_rate": 0.00013099353476711968, + "loss": 0.008, + "step": 2620 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.0005877618095837533, + "learning_rate": 0.00013086159123894975, + "loss": 0.0286, + "step": 2625 + }, + { + "epoch": 0.6934282512688682, + "grad_norm": 0.010759112425148487, + "learning_rate": 0.0001307296477107798, + "loss": 0.0062, + "step": 2630 + }, + { + "epoch": 0.6947465559290752, + "grad_norm": 0.07117745280265808, + "learning_rate": 0.00013059770418260986, + "loss": 0.0891, + "step": 2635 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.0639057606458664, + "learning_rate": 0.00013046576065443993, + "loss": 0.0072, + "step": 2640 + }, + { + "epoch": 0.6973831652494892, + "grad_norm": 0.027350090444087982, + "learning_rate": 
0.00013033381712626994, + "loss": 0.0103, + "step": 2645 + }, + { + "epoch": 0.6987014699096962, + "grad_norm": 0.015336195938289165, + "learning_rate": 0.0001302018735981, + "loss": 0.0041, + "step": 2650 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 1.0650830268859863, + "learning_rate": 0.00013006993006993008, + "loss": 0.0443, + "step": 2655 + }, + { + "epoch": 0.70133807923011, + "grad_norm": 0.019073212519288063, + "learning_rate": 0.00012993798654176012, + "loss": 0.0331, + "step": 2660 + }, + { + "epoch": 0.702656383890317, + "grad_norm": 0.10109209269285202, + "learning_rate": 0.0001298060430135902, + "loss": 0.0054, + "step": 2665 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.03528957813978195, + "learning_rate": 0.00012967409948542023, + "loss": 0.0427, + "step": 2670 + }, + { + "epoch": 0.705292993210731, + "grad_norm": 0.03577788919210434, + "learning_rate": 0.0001295421559572503, + "loss": 0.023, + "step": 2675 + }, + { + "epoch": 0.706611297870938, + "grad_norm": 0.5576180815696716, + "learning_rate": 0.00012941021242908034, + "loss": 0.0416, + "step": 2680 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.017131298780441284, + "learning_rate": 0.0001292782689009104, + "loss": 0.0235, + "step": 2685 + }, + { + "epoch": 0.7092479071913519, + "grad_norm": 0.8517888784408569, + "learning_rate": 0.00012914632537274048, + "loss": 0.0168, + "step": 2690 + }, + { + "epoch": 0.7105662118515589, + "grad_norm": 0.23812156915664673, + "learning_rate": 0.00012901438184457052, + "loss": 0.0483, + "step": 2695 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.11746613681316376, + "learning_rate": 0.0001288824383164006, + "loss": 0.0255, + "step": 2700 + }, + { + "epoch": 0.7132028211719729, + "grad_norm": 0.20089928805828094, + "learning_rate": 0.00012875049478823064, + "loss": 0.0267, + "step": 2705 + }, + { + "epoch": 0.7145211258321799, + "grad_norm": 0.8301129937171936, + "learning_rate": 0.0001286185512600607, + "loss": 0.016, + "step": 2710 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.01838674768805504, + "learning_rate": 0.00012848660773189077, + "loss": 0.0229, + "step": 2715 + }, + { + "epoch": 0.7171577351525937, + "grad_norm": 0.03670337051153183, + "learning_rate": 0.00012835466420372082, + "loss": 0.038, + "step": 2720 + }, + { + "epoch": 0.7184760398128007, + "grad_norm": 0.0452633760869503, + "learning_rate": 0.00012822272067555089, + "loss": 0.0622, + "step": 2725 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.09503110498189926, + "learning_rate": 0.00012809077714738093, + "loss": 0.0209, + "step": 2730 + }, + { + "epoch": 0.7211126491332147, + "grad_norm": 1.0327308177947998, + "learning_rate": 0.000127958833619211, + "loss": 0.0361, + "step": 2735 + }, + { + "epoch": 0.7224309537934217, + "grad_norm": 1.0049290657043457, + "learning_rate": 0.00012782689009104104, + "loss": 0.0365, + "step": 2740 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.029774073511362076, + "learning_rate": 0.00012769494656287108, + "loss": 0.0257, + "step": 2745 + }, + { + "epoch": 0.7250675631138356, + "grad_norm": 0.20974040031433105, + "learning_rate": 0.00012756300303470115, + "loss": 0.0542, + "step": 2750 + }, + { + "epoch": 0.7263858677740426, + "grad_norm": 0.8153854608535767, + "learning_rate": 0.0001274310595065312, + "loss": 0.0216, + "step": 2755 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.4393698573112488, + "learning_rate": 0.00012729911597836126, + "loss": 0.0451, + "step": 2760 + }, + { + "epoch": 
0.7290224770944566, + "grad_norm": 0.06990349292755127, + "learning_rate": 0.00012716717245019133, + "loss": 0.03, + "step": 2765 + }, + { + "epoch": 0.7303407817546635, + "grad_norm": 0.32689470052719116, + "learning_rate": 0.00012703522892202137, + "loss": 0.0263, + "step": 2770 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.026600876823067665, + "learning_rate": 0.00012690328539385144, + "loss": 0.0404, + "step": 2775 + }, + { + "epoch": 0.7329773910750774, + "grad_norm": 0.11228257417678833, + "learning_rate": 0.00012677134186568148, + "loss": 0.0224, + "step": 2780 + }, + { + "epoch": 0.7342956957352844, + "grad_norm": 0.6469443440437317, + "learning_rate": 0.00012663939833751155, + "loss": 0.0178, + "step": 2785 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.020773250609636307, + "learning_rate": 0.0001265074548093416, + "loss": 0.011, + "step": 2790 + }, + { + "epoch": 0.7369323050556984, + "grad_norm": 0.7378728985786438, + "learning_rate": 0.00012637551128117167, + "loss": 0.0227, + "step": 2795 + }, + { + "epoch": 0.7382506097159054, + "grad_norm": 0.008189595304429531, + "learning_rate": 0.00012624356775300173, + "loss": 0.0892, + "step": 2800 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.031633853912353516, + "learning_rate": 0.00012611162422483178, + "loss": 0.0093, + "step": 2805 + }, + { + "epoch": 0.7408872190363193, + "grad_norm": 0.5078475475311279, + "learning_rate": 0.00012597968069666185, + "loss": 0.0567, + "step": 2810 + }, + { + "epoch": 0.7422055236965263, + "grad_norm": 0.21766887605190277, + "learning_rate": 0.0001258477371684919, + "loss": 0.0485, + "step": 2815 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.3029612898826599, + "learning_rate": 0.00012571579364032196, + "loss": 0.032, + "step": 2820 + }, + { + "epoch": 0.7448421330169402, + "grad_norm": 1.2135159969329834, + "learning_rate": 0.00012558385011215203, + "loss": 0.0139, + "step": 2825 + }, + { + "epoch": 0.7461604376771472, + "grad_norm": 0.016875172033905983, + "learning_rate": 0.00012545190658398207, + "loss": 0.0323, + "step": 2830 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.08923230320215225, + "learning_rate": 0.00012531996305581214, + "loss": 0.0343, + "step": 2835 + }, + { + "epoch": 0.7487970469975611, + "grad_norm": 0.2958766520023346, + "learning_rate": 0.00012518801952764215, + "loss": 0.0431, + "step": 2840 + }, + { + "epoch": 0.7501153516577681, + "grad_norm": 0.7344386577606201, + "learning_rate": 0.00012505607599947222, + "loss": 0.0389, + "step": 2845 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.03681635856628418, + "learning_rate": 0.0001249241324713023, + "loss": 0.0258, + "step": 2850 + }, + { + "epoch": 0.7527519609781821, + "grad_norm": 0.22866861522197723, + "learning_rate": 0.00012479218894313233, + "loss": 0.0223, + "step": 2855 + }, + { + "epoch": 0.7540702656383891, + "grad_norm": 0.029770435765385628, + "learning_rate": 0.0001246602454149624, + "loss": 0.0205, + "step": 2860 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.011845707893371582, + "learning_rate": 0.00012452830188679244, + "loss": 0.0252, + "step": 2865 + }, + { + "epoch": 0.756706874958803, + "grad_norm": 0.06696149706840515, + "learning_rate": 0.00012439635835862251, + "loss": 0.0166, + "step": 2870 + }, + { + "epoch": 0.75802517961901, + "grad_norm": 0.01653144136071205, + "learning_rate": 0.00012426441483045256, + "loss": 0.0487, + "step": 2875 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.031312476843595505, + 
"learning_rate": 0.00012413247130228263, + "loss": 0.0155, + "step": 2880 + }, + { + "epoch": 0.7606617889394239, + "grad_norm": 0.011625733226537704, + "learning_rate": 0.0001240005277741127, + "loss": 0.0333, + "step": 2885 + }, + { + "epoch": 0.7619800935996309, + "grad_norm": 0.012089414522051811, + "learning_rate": 0.00012386858424594274, + "loss": 0.003, + "step": 2890 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.3012307584285736, + "learning_rate": 0.0001237366407177728, + "loss": 0.0172, + "step": 2895 + }, + { + "epoch": 0.7646167029200448, + "grad_norm": 0.31575000286102295, + "learning_rate": 0.00012360469718960285, + "loss": 0.0409, + "step": 2900 + }, + { + "epoch": 0.7659350075802518, + "grad_norm": 0.009794364683330059, + "learning_rate": 0.00012347275366143292, + "loss": 0.0214, + "step": 2905 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5973085165023804, + "learning_rate": 0.00012334081013326299, + "loss": 0.0245, + "step": 2910 + }, + { + "epoch": 0.7685716169006658, + "grad_norm": 0.019750040024518967, + "learning_rate": 0.00012320886660509303, + "loss": 0.0063, + "step": 2915 + }, + { + "epoch": 0.7698899215608728, + "grad_norm": 0.06402858346700668, + "learning_rate": 0.0001230769230769231, + "loss": 0.0444, + "step": 2920 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.02876671403646469, + "learning_rate": 0.00012294497954875314, + "loss": 0.0103, + "step": 2925 + }, + { + "epoch": 0.7725265308812866, + "grad_norm": 0.6962207555770874, + "learning_rate": 0.0001228130360205832, + "loss": 0.0318, + "step": 2930 + }, + { + "epoch": 0.7738448355414936, + "grad_norm": 0.006536522414535284, + "learning_rate": 0.00012268109249241325, + "loss": 0.0096, + "step": 2935 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.07097168266773224, + "learning_rate": 0.0001225491489642433, + "loss": 0.0174, + "step": 2940 + }, + { + "epoch": 0.7764814448619076, + "grad_norm": 0.042360126972198486, + "learning_rate": 0.00012241720543607336, + "loss": 0.0158, + "step": 2945 + }, + { + "epoch": 0.7777997495221146, + "grad_norm": 0.01159572321921587, + "learning_rate": 0.0001222852619079034, + "loss": 0.0265, + "step": 2950 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.38408163189888, + "learning_rate": 0.00012215331837973347, + "loss": 0.0233, + "step": 2955 + }, + { + "epoch": 0.7804363588425285, + "grad_norm": 0.15588605403900146, + "learning_rate": 0.00012202137485156353, + "loss": 0.0041, + "step": 2960 + }, + { + "epoch": 0.7817546635027355, + "grad_norm": 0.006892362609505653, + "learning_rate": 0.00012188943132339358, + "loss": 0.0026, + "step": 2965 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.030915727838873863, + "learning_rate": 0.00012175748779522364, + "loss": 0.0028, + "step": 2970 + }, + { + "epoch": 0.7843912728231495, + "grad_norm": 0.8151025772094727, + "learning_rate": 0.00012162554426705371, + "loss": 0.0429, + "step": 2975 + }, + { + "epoch": 0.7857095774833565, + "grad_norm": 0.6765475273132324, + "learning_rate": 0.00012149360073888377, + "loss": 0.0319, + "step": 2980 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.054469238966703415, + "learning_rate": 0.00012136165721071382, + "loss": 0.0413, + "step": 2985 + }, + { + "epoch": 0.7883461868037703, + "grad_norm": 0.045610666275024414, + "learning_rate": 0.00012122971368254388, + "loss": 0.0521, + "step": 2990 + }, + { + "epoch": 0.7896644914639773, + "grad_norm": 0.4222470223903656, + "learning_rate": 0.00012109777015437393, + "loss": 0.0846, + 
"step": 2995 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.0272397268563509, + "learning_rate": 0.00012096582662620399, + "loss": 0.0364, + "step": 3000 + }, + { + "epoch": 0.7909827961241843, + "eval_loss": 0.033312585204839706, + "eval_runtime": 452.2552, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 3000 + }, + { + "epoch": 0.7923011007843913, + "grad_norm": 0.08674059063196182, + "learning_rate": 0.00012083388309803406, + "loss": 0.0081, + "step": 3005 + }, + { + "epoch": 0.7936194054445982, + "grad_norm": 0.21960832178592682, + "learning_rate": 0.00012070193956986411, + "loss": 0.0468, + "step": 3010 + }, + { + "epoch": 0.7949377101048052, + "grad_norm": 0.11259289085865021, + "learning_rate": 0.00012056999604169417, + "loss": 0.0124, + "step": 3015 + }, + { + "epoch": 0.7962560147650122, + "grad_norm": 0.02945362776517868, + "learning_rate": 0.00012043805251352422, + "loss": 0.0298, + "step": 3020 + }, + { + "epoch": 0.7975743194252192, + "grad_norm": 0.27889615297317505, + "learning_rate": 0.00012030610898535428, + "loss": 0.0251, + "step": 3025 + }, + { + "epoch": 0.7988926240854262, + "grad_norm": 0.05873241275548935, + "learning_rate": 0.00012017416545718434, + "loss": 0.0132, + "step": 3030 + }, + { + "epoch": 0.8002109287456332, + "grad_norm": 0.1570046991109848, + "learning_rate": 0.00012004222192901439, + "loss": 0.0228, + "step": 3035 + }, + { + "epoch": 0.80152923340584, + "grad_norm": 0.12575332820415497, + "learning_rate": 0.00011991027840084443, + "loss": 0.0049, + "step": 3040 + }, + { + "epoch": 0.802847538066047, + "grad_norm": 0.8416435122489929, + "learning_rate": 0.00011977833487267449, + "loss": 0.0542, + "step": 3045 + }, + { + "epoch": 0.804165842726254, + "grad_norm": 0.2605098485946655, + "learning_rate": 0.00011964639134450454, + "loss": 0.0084, + "step": 3050 + }, + { + "epoch": 0.805484147386461, + "grad_norm": 0.8996294736862183, + "learning_rate": 0.00011951444781633461, + "loss": 0.0442, + "step": 3055 + }, + { + "epoch": 0.806802452046668, + "grad_norm": 2.7525105476379395, + "learning_rate": 0.00011938250428816467, + "loss": 0.0642, + "step": 3060 + }, + { + "epoch": 0.808120756706875, + "grad_norm": 0.14955930411815643, + "learning_rate": 0.00011925056075999473, + "loss": 0.0384, + "step": 3065 + }, + { + "epoch": 0.8094390613670819, + "grad_norm": 0.018756115809082985, + "learning_rate": 0.00011911861723182478, + "loss": 0.0154, + "step": 3070 + }, + { + "epoch": 0.8107573660272889, + "grad_norm": 0.23998615145683289, + "learning_rate": 0.00011898667370365484, + "loss": 0.0413, + "step": 3075 + }, + { + "epoch": 0.8120756706874959, + "grad_norm": 0.27253249287605286, + "learning_rate": 0.00011885473017548489, + "loss": 0.0081, + "step": 3080 + }, + { + "epoch": 0.8133939753477029, + "grad_norm": 0.2925993502140045, + "learning_rate": 0.00011872278664731495, + "loss": 0.0332, + "step": 3085 + }, + { + "epoch": 0.8147122800079099, + "grad_norm": 0.5364832878112793, + "learning_rate": 0.00011859084311914502, + "loss": 0.0143, + "step": 3090 + }, + { + "epoch": 0.8160305846681168, + "grad_norm": 0.32104921340942383, + "learning_rate": 0.00011845889959097507, + "loss": 0.0216, + "step": 3095 + }, + { + "epoch": 0.8173488893283237, + "grad_norm": 0.0205856766551733, + "learning_rate": 0.00011832695606280513, + "loss": 0.0346, + "step": 3100 + }, + { + "epoch": 0.8186671939885307, + "grad_norm": 0.2541547417640686, + "learning_rate": 0.00011819501253463518, + "loss": 0.0793, + "step": 3105 + }, + { + "epoch": 
0.8199854986487377, + "grad_norm": 0.08333491533994675, + "learning_rate": 0.00011806306900646524, + "loss": 0.0049, + "step": 3110 + }, + { + "epoch": 0.8213038033089447, + "grad_norm": 0.0355968177318573, + "learning_rate": 0.0001179311254782953, + "loss": 0.0051, + "step": 3115 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.06948401033878326, + "learning_rate": 0.00011779918195012536, + "loss": 0.013, + "step": 3120 + }, + { + "epoch": 0.8239404126293587, + "grad_norm": 0.03328891843557358, + "learning_rate": 0.00011766723842195542, + "loss": 0.0122, + "step": 3125 + }, + { + "epoch": 0.8252587172895656, + "grad_norm": 0.013782350346446037, + "learning_rate": 0.00011753529489378548, + "loss": 0.0073, + "step": 3130 + }, + { + "epoch": 0.8265770219497726, + "grad_norm": 0.024390392005443573, + "learning_rate": 0.00011740335136561553, + "loss": 0.0143, + "step": 3135 + }, + { + "epoch": 0.8278953266099796, + "grad_norm": 0.002548128366470337, + "learning_rate": 0.00011727140783744557, + "loss": 0.0027, + "step": 3140 + }, + { + "epoch": 0.8292136312701865, + "grad_norm": 0.11674848943948746, + "learning_rate": 0.00011713946430927563, + "loss": 0.0253, + "step": 3145 + }, + { + "epoch": 0.8305319359303935, + "grad_norm": 0.005774884019047022, + "learning_rate": 0.00011700752078110568, + "loss": 0.0018, + "step": 3150 + }, + { + "epoch": 0.8318502405906005, + "grad_norm": 0.5763069987297058, + "learning_rate": 0.00011687557725293574, + "loss": 0.0119, + "step": 3155 + }, + { + "epoch": 0.8331685452508074, + "grad_norm": 0.0027607593219727278, + "learning_rate": 0.0001167436337247658, + "loss": 0.0279, + "step": 3160 + }, + { + "epoch": 0.8344868499110144, + "grad_norm": 1.859642505645752, + "learning_rate": 0.00011661169019659585, + "loss": 0.0228, + "step": 3165 + }, + { + "epoch": 0.8358051545712214, + "grad_norm": 0.16597022116184235, + "learning_rate": 0.00011647974666842592, + "loss": 0.1228, + "step": 3170 + }, + { + "epoch": 0.8371234592314284, + "grad_norm": 0.33833742141723633, + "learning_rate": 0.00011634780314025598, + "loss": 0.073, + "step": 3175 + }, + { + "epoch": 0.8384417638916354, + "grad_norm": 0.024682912975549698, + "learning_rate": 0.00011621585961208603, + "loss": 0.0042, + "step": 3180 + }, + { + "epoch": 0.8397600685518424, + "grad_norm": 0.05926942452788353, + "learning_rate": 0.00011608391608391609, + "loss": 0.0066, + "step": 3185 + }, + { + "epoch": 0.8410783732120493, + "grad_norm": 0.1414029747247696, + "learning_rate": 0.00011595197255574614, + "loss": 0.0603, + "step": 3190 + }, + { + "epoch": 0.8423966778722562, + "grad_norm": 0.37928736209869385, + "learning_rate": 0.0001158200290275762, + "loss": 0.0266, + "step": 3195 + }, + { + "epoch": 0.8437149825324632, + "grad_norm": 0.018329354003071785, + "learning_rate": 0.00011568808549940627, + "loss": 0.0047, + "step": 3200 + }, + { + "epoch": 0.8450332871926702, + "grad_norm": 0.2993735373020172, + "learning_rate": 0.00011555614197123632, + "loss": 0.0218, + "step": 3205 + }, + { + "epoch": 0.8463515918528772, + "grad_norm": 0.1767728328704834, + "learning_rate": 0.00011542419844306638, + "loss": 0.0363, + "step": 3210 + }, + { + "epoch": 0.8476698965130842, + "grad_norm": 0.39774414896965027, + "learning_rate": 0.00011529225491489644, + "loss": 0.0506, + "step": 3215 + }, + { + "epoch": 0.8489882011732911, + "grad_norm": 0.021896762773394585, + "learning_rate": 0.00011516031138672649, + "loss": 0.0081, + "step": 3220 + }, + { + "epoch": 0.8503065058334981, + "grad_norm": 0.358372300863266, + 
"learning_rate": 0.00011502836785855655, + "loss": 0.0224, + "step": 3225 + }, + { + "epoch": 0.8516248104937051, + "grad_norm": 0.01605542004108429, + "learning_rate": 0.00011489642433038662, + "loss": 0.0215, + "step": 3230 + }, + { + "epoch": 0.8529431151539121, + "grad_norm": 0.021189266815781593, + "learning_rate": 0.00011476448080221667, + "loss": 0.0051, + "step": 3235 + }, + { + "epoch": 0.8542614198141191, + "grad_norm": 0.013394076377153397, + "learning_rate": 0.0001146325372740467, + "loss": 0.021, + "step": 3240 + }, + { + "epoch": 0.8555797244743261, + "grad_norm": 0.19848507642745972, + "learning_rate": 0.00011450059374587676, + "loss": 0.0285, + "step": 3245 + }, + { + "epoch": 0.856898029134533, + "grad_norm": 0.2463046759366989, + "learning_rate": 0.00011436865021770683, + "loss": 0.0384, + "step": 3250 + }, + { + "epoch": 0.8582163337947399, + "grad_norm": 0.37432390451431274, + "learning_rate": 0.00011423670668953688, + "loss": 0.0098, + "step": 3255 + }, + { + "epoch": 0.8595346384549469, + "grad_norm": 0.060943394899368286, + "learning_rate": 0.00011410476316136694, + "loss": 0.0087, + "step": 3260 + }, + { + "epoch": 0.8608529431151539, + "grad_norm": 0.2846696674823761, + "learning_rate": 0.00011397281963319699, + "loss": 0.0148, + "step": 3265 + }, + { + "epoch": 0.8621712477753609, + "grad_norm": 0.009311323054134846, + "learning_rate": 0.00011384087610502705, + "loss": 0.0024, + "step": 3270 + }, + { + "epoch": 0.8634895524355679, + "grad_norm": 0.046277035027742386, + "learning_rate": 0.0001137089325768571, + "loss": 0.0274, + "step": 3275 + }, + { + "epoch": 0.8648078570957748, + "grad_norm": 0.006024620030075312, + "learning_rate": 0.00011357698904868716, + "loss": 0.0286, + "step": 3280 + }, + { + "epoch": 0.8661261617559818, + "grad_norm": 0.033578380942344666, + "learning_rate": 0.00011344504552051723, + "loss": 0.0153, + "step": 3285 + }, + { + "epoch": 0.8674444664161888, + "grad_norm": 0.8537917137145996, + "learning_rate": 0.00011331310199234728, + "loss": 0.0304, + "step": 3290 + }, + { + "epoch": 0.8687627710763958, + "grad_norm": 0.013933337293565273, + "learning_rate": 0.00011318115846417734, + "loss": 0.0112, + "step": 3295 + }, + { + "epoch": 0.8700810757366028, + "grad_norm": 0.35437721014022827, + "learning_rate": 0.0001130492149360074, + "loss": 0.0228, + "step": 3300 + }, + { + "epoch": 0.8713993803968098, + "grad_norm": 1.3024121522903442, + "learning_rate": 0.00011291727140783745, + "loss": 0.0203, + "step": 3305 + }, + { + "epoch": 0.8727176850570166, + "grad_norm": 0.5131255984306335, + "learning_rate": 0.00011278532787966751, + "loss": 0.0181, + "step": 3310 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.039366886019706726, + "learning_rate": 0.00011265338435149758, + "loss": 0.0192, + "step": 3315 + }, + { + "epoch": 0.8753542943774306, + "grad_norm": 0.13679669797420502, + "learning_rate": 0.00011252144082332763, + "loss": 0.004, + "step": 3320 + }, + { + "epoch": 0.8766725990376376, + "grad_norm": 0.003076886525377631, + "learning_rate": 0.00011238949729515769, + "loss": 0.0405, + "step": 3325 + }, + { + "epoch": 0.8779909036978446, + "grad_norm": 0.019953785464167595, + "learning_rate": 0.00011225755376698774, + "loss": 0.0241, + "step": 3330 + }, + { + "epoch": 0.8793092083580516, + "grad_norm": 0.007980377413332462, + "learning_rate": 0.0001121256102388178, + "loss": 0.0064, + "step": 3335 + }, + { + "epoch": 0.8806275130182585, + "grad_norm": 0.018761295825242996, + "learning_rate": 0.00011199366671064784, + "loss": 
0.0032, + "step": 3340 + }, + { + "epoch": 0.8819458176784655, + "grad_norm": 0.022511709481477737, + "learning_rate": 0.0001118617231824779, + "loss": 0.0055, + "step": 3345 + }, + { + "epoch": 0.8832641223386725, + "grad_norm": 0.021270718425512314, + "learning_rate": 0.00011172977965430795, + "loss": 0.033, + "step": 3350 + }, + { + "epoch": 0.8845824269988795, + "grad_norm": 0.02710561640560627, + "learning_rate": 0.00011159783612613801, + "loss": 0.0094, + "step": 3355 + }, + { + "epoch": 0.8859007316590864, + "grad_norm": 0.4353378117084503, + "learning_rate": 0.00011146589259796806, + "loss": 0.0089, + "step": 3360 + }, + { + "epoch": 0.8872190363192934, + "grad_norm": 0.0257766991853714, + "learning_rate": 0.00011133394906979813, + "loss": 0.0059, + "step": 3365 + }, + { + "epoch": 0.8885373409795003, + "grad_norm": 0.80838942527771, + "learning_rate": 0.00011120200554162819, + "loss": 0.0263, + "step": 3370 + }, + { + "epoch": 0.8898556456397073, + "grad_norm": 0.007799761835485697, + "learning_rate": 0.00011107006201345824, + "loss": 0.0028, + "step": 3375 + }, + { + "epoch": 0.8911739502999143, + "grad_norm": 0.007315775845199823, + "learning_rate": 0.0001109381184852883, + "loss": 0.0127, + "step": 3380 + }, + { + "epoch": 0.8924922549601213, + "grad_norm": 1.4861233234405518, + "learning_rate": 0.00011080617495711836, + "loss": 0.0562, + "step": 3385 + }, + { + "epoch": 0.8938105596203283, + "grad_norm": 0.010219530202448368, + "learning_rate": 0.00011067423142894841, + "loss": 0.0438, + "step": 3390 + }, + { + "epoch": 0.8951288642805353, + "grad_norm": 1.0191857814788818, + "learning_rate": 0.00011054228790077848, + "loss": 0.0493, + "step": 3395 + }, + { + "epoch": 0.8964471689407422, + "grad_norm": 0.01459536887705326, + "learning_rate": 0.00011041034437260854, + "loss": 0.0117, + "step": 3400 + }, + { + "epoch": 0.8977654736009492, + "grad_norm": 0.008682495914399624, + "learning_rate": 0.00011027840084443859, + "loss": 0.02, + "step": 3405 + }, + { + "epoch": 0.8990837782611562, + "grad_norm": 0.02197263017296791, + "learning_rate": 0.00011014645731626865, + "loss": 0.0454, + "step": 3410 + }, + { + "epoch": 0.9004020829213631, + "grad_norm": 0.01436714269220829, + "learning_rate": 0.0001100145137880987, + "loss": 0.0283, + "step": 3415 + }, + { + "epoch": 0.9017203875815701, + "grad_norm": 0.14327946305274963, + "learning_rate": 0.00010988257025992876, + "loss": 0.0461, + "step": 3420 + }, + { + "epoch": 0.9030386922417771, + "grad_norm": 1.671773910522461, + "learning_rate": 0.00010975062673175883, + "loss": 0.054, + "step": 3425 + }, + { + "epoch": 0.904356996901984, + "grad_norm": 0.009926804341375828, + "learning_rate": 0.00010961868320358888, + "loss": 0.0429, + "step": 3430 + }, + { + "epoch": 0.905675301562191, + "grad_norm": 0.554020881652832, + "learning_rate": 0.00010948673967541894, + "loss": 0.0618, + "step": 3435 + }, + { + "epoch": 0.906993606222398, + "grad_norm": 0.1399248093366623, + "learning_rate": 0.00010935479614724897, + "loss": 0.0229, + "step": 3440 + }, + { + "epoch": 0.908311910882605, + "grad_norm": 0.02739197015762329, + "learning_rate": 0.00010922285261907904, + "loss": 0.0082, + "step": 3445 + }, + { + "epoch": 0.909630215542812, + "grad_norm": 0.33394527435302734, + "learning_rate": 0.00010909090909090909, + "loss": 0.0403, + "step": 3450 + }, + { + "epoch": 0.9109485202030189, + "grad_norm": 0.08083894103765488, + "learning_rate": 0.00010895896556273915, + "loss": 0.0406, + "step": 3455 + }, + { + "epoch": 0.9122668248632259, + 
"grad_norm": 0.39336663484573364, + "learning_rate": 0.0001088270220345692, + "loss": 0.02, + "step": 3460 + }, + { + "epoch": 0.9135851295234328, + "grad_norm": 0.20481553673744202, + "learning_rate": 0.00010869507850639926, + "loss": 0.0221, + "step": 3465 + }, + { + "epoch": 0.9149034341836398, + "grad_norm": 1.4507408142089844, + "learning_rate": 0.00010856313497822932, + "loss": 0.0357, + "step": 3470 + }, + { + "epoch": 0.9162217388438468, + "grad_norm": 0.2678806483745575, + "learning_rate": 0.00010843119145005937, + "loss": 0.0181, + "step": 3475 + }, + { + "epoch": 0.9175400435040538, + "grad_norm": 0.007361674215644598, + "learning_rate": 0.00010829924792188944, + "loss": 0.0978, + "step": 3480 + }, + { + "epoch": 0.9188583481642607, + "grad_norm": 0.773695707321167, + "learning_rate": 0.0001081673043937195, + "loss": 0.0401, + "step": 3485 + }, + { + "epoch": 0.9201766528244677, + "grad_norm": 0.0010772625682875514, + "learning_rate": 0.00010803536086554955, + "loss": 0.0233, + "step": 3490 + }, + { + "epoch": 0.9214949574846747, + "grad_norm": 0.08971104770898819, + "learning_rate": 0.00010790341733737961, + "loss": 0.0319, + "step": 3495 + }, + { + "epoch": 0.9228132621448817, + "grad_norm": 0.21372731029987335, + "learning_rate": 0.00010777147380920966, + "loss": 0.0315, + "step": 3500 + }, + { + "epoch": 0.9228132621448817, + "eval_loss": 0.02952708676457405, + "eval_runtime": 451.5837, + "eval_samples_per_second": 7.467, + "eval_steps_per_second": 3.734, + "step": 3500 + }, + { + "epoch": 0.9241315668050887, + "grad_norm": 0.016639264300465584, + "learning_rate": 0.00010763953028103972, + "loss": 0.0125, + "step": 3505 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.46340492367744446, + "learning_rate": 0.00010750758675286979, + "loss": 0.0186, + "step": 3510 + }, + { + "epoch": 0.9267681761255026, + "grad_norm": 0.01847526989877224, + "learning_rate": 0.00010737564322469984, + "loss": 0.0026, + "step": 3515 + }, + { + "epoch": 0.9280864807857095, + "grad_norm": 0.5947860479354858, + "learning_rate": 0.0001072436996965299, + "loss": 0.0259, + "step": 3520 + }, + { + "epoch": 0.9294047854459165, + "grad_norm": 0.06145291402935982, + "learning_rate": 0.00010711175616835995, + "loss": 0.0057, + "step": 3525 + }, + { + "epoch": 0.9307230901061235, + "grad_norm": 0.0143959429115057, + "learning_rate": 0.00010697981264019001, + "loss": 0.0145, + "step": 3530 + }, + { + "epoch": 0.9320413947663305, + "grad_norm": 0.21143831312656403, + "learning_rate": 0.00010684786911202007, + "loss": 0.0459, + "step": 3535 + }, + { + "epoch": 0.9333596994265375, + "grad_norm": 0.02548077143728733, + "learning_rate": 0.00010671592558385011, + "loss": 0.0051, + "step": 3540 + }, + { + "epoch": 0.9346780040867444, + "grad_norm": 0.008077048696577549, + "learning_rate": 0.00010658398205568016, + "loss": 0.0306, + "step": 3545 + }, + { + "epoch": 0.9359963087469514, + "grad_norm": 0.0030760422814637423, + "learning_rate": 0.00010645203852751022, + "loss": 0.0575, + "step": 3550 + }, + { + "epoch": 0.9373146134071584, + "grad_norm": 0.18114158511161804, + "learning_rate": 0.00010632009499934027, + "loss": 0.0885, + "step": 3555 + }, + { + "epoch": 0.9386329180673654, + "grad_norm": 0.02450549602508545, + "learning_rate": 0.00010618815147117034, + "loss": 0.0045, + "step": 3560 + }, + { + "epoch": 0.9399512227275724, + "grad_norm": 0.1238626018166542, + "learning_rate": 0.0001060562079430004, + "loss": 0.0166, + "step": 3565 + }, + { + "epoch": 0.9412695273877794, + "grad_norm": 
0.1879919469356537, + "learning_rate": 0.00010592426441483046, + "loss": 0.0077, + "step": 3570 + }, + { + "epoch": 0.9425878320479862, + "grad_norm": 0.11323565989732742, + "learning_rate": 0.00010579232088666051, + "loss": 0.0213, + "step": 3575 + }, + { + "epoch": 0.9439061367081932, + "grad_norm": 0.35575854778289795, + "learning_rate": 0.00010566037735849057, + "loss": 0.0336, + "step": 3580 + }, + { + "epoch": 0.9452244413684002, + "grad_norm": 0.14052227139472961, + "learning_rate": 0.00010552843383032062, + "loss": 0.0325, + "step": 3585 + }, + { + "epoch": 0.9465427460286072, + "grad_norm": 0.2643798887729645, + "learning_rate": 0.00010539649030215069, + "loss": 0.0192, + "step": 3590 + }, + { + "epoch": 0.9478610506888142, + "grad_norm": 0.3207031190395355, + "learning_rate": 0.00010526454677398075, + "loss": 0.0221, + "step": 3595 + }, + { + "epoch": 0.9491793553490212, + "grad_norm": 0.022803861647844315, + "learning_rate": 0.0001051326032458108, + "loss": 0.029, + "step": 3600 + }, + { + "epoch": 0.9504976600092281, + "grad_norm": 0.02511664852499962, + "learning_rate": 0.00010500065971764086, + "loss": 0.0422, + "step": 3605 + }, + { + "epoch": 0.9518159646694351, + "grad_norm": 0.06505445390939713, + "learning_rate": 0.00010486871618947091, + "loss": 0.0092, + "step": 3610 + }, + { + "epoch": 0.9531342693296421, + "grad_norm": 0.09998584538698196, + "learning_rate": 0.00010473677266130097, + "loss": 0.0242, + "step": 3615 + }, + { + "epoch": 0.9544525739898491, + "grad_norm": 0.9645698666572571, + "learning_rate": 0.00010460482913313104, + "loss": 0.0124, + "step": 3620 + }, + { + "epoch": 0.955770878650056, + "grad_norm": 0.2389964610338211, + "learning_rate": 0.0001044728856049611, + "loss": 0.0169, + "step": 3625 + }, + { + "epoch": 0.957089183310263, + "grad_norm": 2.030608654022217, + "learning_rate": 0.00010434094207679115, + "loss": 0.0518, + "step": 3630 + }, + { + "epoch": 0.9584074879704699, + "grad_norm": 0.05979987606406212, + "learning_rate": 0.0001042089985486212, + "loss": 0.0081, + "step": 3635 + }, + { + "epoch": 0.9597257926306769, + "grad_norm": 0.15761719644069672, + "learning_rate": 0.00010407705502045125, + "loss": 0.0061, + "step": 3640 + }, + { + "epoch": 0.9610440972908839, + "grad_norm": 0.6534290909767151, + "learning_rate": 0.0001039451114922813, + "loss": 0.0104, + "step": 3645 + }, + { + "epoch": 0.9623624019510909, + "grad_norm": 1.0324147939682007, + "learning_rate": 0.00010381316796411136, + "loss": 0.0381, + "step": 3650 + }, + { + "epoch": 0.9636807066112979, + "grad_norm": 0.002968872431665659, + "learning_rate": 0.00010368122443594142, + "loss": 0.0343, + "step": 3655 + }, + { + "epoch": 0.9649990112715049, + "grad_norm": 0.011243184097111225, + "learning_rate": 0.00010354928090777147, + "loss": 0.019, + "step": 3660 + }, + { + "epoch": 0.9663173159317118, + "grad_norm": 0.17663739621639252, + "learning_rate": 0.00010341733737960153, + "loss": 0.0452, + "step": 3665 + }, + { + "epoch": 0.9676356205919188, + "grad_norm": 1.2647719383239746, + "learning_rate": 0.00010328539385143158, + "loss": 0.0154, + "step": 3670 + }, + { + "epoch": 0.9689539252521258, + "grad_norm": 0.3691752552986145, + "learning_rate": 0.00010315345032326165, + "loss": 0.028, + "step": 3675 + }, + { + "epoch": 0.9702722299123328, + "grad_norm": 0.0015879774000495672, + "learning_rate": 0.00010302150679509171, + "loss": 0.0202, + "step": 3680 + }, + { + "epoch": 0.9715905345725397, + "grad_norm": 0.1441984623670578, + "learning_rate": 0.00010288956326692176, + "loss": 
0.0221, + "step": 3685 + }, + { + "epoch": 0.9729088392327467, + "grad_norm": 0.20431455969810486, + "learning_rate": 0.00010275761973875182, + "loss": 0.0072, + "step": 3690 + }, + { + "epoch": 0.9742271438929536, + "grad_norm": 0.861625611782074, + "learning_rate": 0.00010262567621058187, + "loss": 0.0523, + "step": 3695 + }, + { + "epoch": 0.9755454485531606, + "grad_norm": 0.005049478262662888, + "learning_rate": 0.00010249373268241193, + "loss": 0.0051, + "step": 3700 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.49685510993003845, + "learning_rate": 0.000102361789154242, + "loss": 0.023, + "step": 3705 + }, + { + "epoch": 0.9781820578735746, + "grad_norm": 0.08789395540952682, + "learning_rate": 0.00010222984562607205, + "loss": 0.0159, + "step": 3710 + }, + { + "epoch": 0.9795003625337816, + "grad_norm": 0.027168691158294678, + "learning_rate": 0.00010209790209790211, + "loss": 0.0083, + "step": 3715 + }, + { + "epoch": 0.9808186671939886, + "grad_norm": 0.0006773864733986557, + "learning_rate": 0.00010196595856973217, + "loss": 0.0048, + "step": 3720 + }, + { + "epoch": 0.9821369718541955, + "grad_norm": 0.01636457070708275, + "learning_rate": 0.00010183401504156222, + "loss": 0.0159, + "step": 3725 + }, + { + "epoch": 0.9834552765144025, + "grad_norm": 0.10160859674215317, + "learning_rate": 0.00010170207151339228, + "loss": 0.0047, + "step": 3730 + }, + { + "epoch": 0.9847735811746094, + "grad_norm": 0.14173269271850586, + "learning_rate": 0.00010157012798522232, + "loss": 0.006, + "step": 3735 + }, + { + "epoch": 0.9860918858348164, + "grad_norm": 0.003458512481302023, + "learning_rate": 0.00010143818445705238, + "loss": 0.0193, + "step": 3740 + }, + { + "epoch": 0.9874101904950234, + "grad_norm": 0.005163820460438728, + "learning_rate": 0.00010130624092888243, + "loss": 0.0039, + "step": 3745 + }, + { + "epoch": 0.9887284951552304, + "grad_norm": 0.005913791712373495, + "learning_rate": 0.00010117429740071249, + "loss": 0.0119, + "step": 3750 + }, + { + "epoch": 0.9900467998154373, + "grad_norm": 0.00800853967666626, + "learning_rate": 0.00010104235387254256, + "loss": 0.044, + "step": 3755 + }, + { + "epoch": 0.9913651044756443, + "grad_norm": 0.18146778643131256, + "learning_rate": 0.00010091041034437261, + "loss": 0.0048, + "step": 3760 + }, + { + "epoch": 0.9926834091358513, + "grad_norm": 0.01235104724764824, + "learning_rate": 0.00010077846681620267, + "loss": 0.0017, + "step": 3765 + }, + { + "epoch": 0.9940017137960583, + "grad_norm": 0.17677897214889526, + "learning_rate": 0.00010064652328803272, + "loss": 0.0339, + "step": 3770 + }, + { + "epoch": 0.9953200184562653, + "grad_norm": 0.0017472271574661136, + "learning_rate": 0.00010051457975986278, + "loss": 0.0494, + "step": 3775 + }, + { + "epoch": 0.9966383231164723, + "grad_norm": 0.10814860463142395, + "learning_rate": 0.00010038263623169283, + "loss": 0.0741, + "step": 3780 + }, + { + "epoch": 0.9979566277766792, + "grad_norm": 0.11329760402441025, + "learning_rate": 0.0001002506927035229, + "loss": 0.0182, + "step": 3785 + }, + { + "epoch": 0.9992749324368861, + "grad_norm": 0.11573276668787003, + "learning_rate": 0.00010011874917535296, + "loss": 0.0068, + "step": 3790 + }, + { + "epoch": 1.000790982796124, + "grad_norm": 0.08449886739253998, + "learning_rate": 9.998680564718301e-05, + "loss": 0.0141, + "step": 3795 + }, + { + "epoch": 1.002109287456331, + "grad_norm": 0.05035184696316719, + "learning_rate": 9.985486211901307e-05, + "loss": 0.0293, + "step": 3800 + }, + { + "epoch": 1.003427592116538, 
+ "grad_norm": 0.0255444198846817, + "learning_rate": 9.972291859084313e-05, + "loss": 0.0054, + "step": 3805 + }, + { + "epoch": 1.004745896776745, + "grad_norm": 0.0033677336759865284, + "learning_rate": 9.959097506267318e-05, + "loss": 0.0567, + "step": 3810 + }, + { + "epoch": 1.006064201436952, + "grad_norm": 0.09453682601451874, + "learning_rate": 9.945903153450324e-05, + "loss": 0.0589, + "step": 3815 + }, + { + "epoch": 1.007382506097159, + "grad_norm": 0.01592979207634926, + "learning_rate": 9.932708800633329e-05, + "loss": 0.0043, + "step": 3820 + }, + { + "epoch": 1.008700810757366, + "grad_norm": 0.002263693604618311, + "learning_rate": 9.919514447816335e-05, + "loss": 0.0195, + "step": 3825 + }, + { + "epoch": 1.010019115417573, + "grad_norm": 0.013390793465077877, + "learning_rate": 9.90632009499934e-05, + "loss": 0.0152, + "step": 3830 + }, + { + "epoch": 1.01133742007778, + "grad_norm": 0.10473847389221191, + "learning_rate": 9.893125742182346e-05, + "loss": 0.0606, + "step": 3835 + }, + { + "epoch": 1.012655724737987, + "grad_norm": 0.05837221071124077, + "learning_rate": 9.879931389365353e-05, + "loss": 0.0121, + "step": 3840 + }, + { + "epoch": 1.013974029398194, + "grad_norm": 0.3803791105747223, + "learning_rate": 9.866737036548358e-05, + "loss": 0.0386, + "step": 3845 + }, + { + "epoch": 1.0152923340584008, + "grad_norm": 0.4067519009113312, + "learning_rate": 9.853542683731364e-05, + "loss": 0.0115, + "step": 3850 + }, + { + "epoch": 1.0166106387186078, + "grad_norm": 0.02585229091346264, + "learning_rate": 9.84034833091437e-05, + "loss": 0.0214, + "step": 3855 + }, + { + "epoch": 1.0179289433788148, + "grad_norm": 0.03670825809240341, + "learning_rate": 9.827153978097374e-05, + "loss": 0.0059, + "step": 3860 + }, + { + "epoch": 1.0192472480390218, + "grad_norm": 0.014171554706990719, + "learning_rate": 9.81395962528038e-05, + "loss": 0.0145, + "step": 3865 + }, + { + "epoch": 1.0205655526992288, + "grad_norm": 0.027376385405659676, + "learning_rate": 9.800765272463386e-05, + "loss": 0.0089, + "step": 3870 + }, + { + "epoch": 1.0218838573594358, + "grad_norm": 0.03168405964970589, + "learning_rate": 9.787570919646392e-05, + "loss": 0.0132, + "step": 3875 + }, + { + "epoch": 1.0232021620196428, + "grad_norm": 0.03346199914813042, + "learning_rate": 9.774376566829397e-05, + "loss": 0.0246, + "step": 3880 + }, + { + "epoch": 1.0245204666798498, + "grad_norm": 0.00894144270569086, + "learning_rate": 9.761182214012403e-05, + "loss": 0.0105, + "step": 3885 + }, + { + "epoch": 1.0258387713400567, + "grad_norm": 0.3172806203365326, + "learning_rate": 9.747987861195409e-05, + "loss": 0.0103, + "step": 3890 + }, + { + "epoch": 1.0271570760002637, + "grad_norm": 0.009055040776729584, + "learning_rate": 9.734793508378414e-05, + "loss": 0.0103, + "step": 3895 + }, + { + "epoch": 1.0284753806604707, + "grad_norm": 0.014140011742711067, + "learning_rate": 9.721599155561421e-05, + "loss": 0.0037, + "step": 3900 + }, + { + "epoch": 1.0297936853206777, + "grad_norm": 0.008317383006215096, + "learning_rate": 9.708404802744427e-05, + "loss": 0.002, + "step": 3905 + }, + { + "epoch": 1.0311119899808845, + "grad_norm": 0.005038558971136808, + "learning_rate": 9.695210449927431e-05, + "loss": 0.0017, + "step": 3910 + }, + { + "epoch": 1.0324302946410915, + "grad_norm": 0.40058520436286926, + "learning_rate": 9.682016097110436e-05, + "loss": 0.0065, + "step": 3915 + }, + { + "epoch": 1.0337485993012985, + "grad_norm": 0.005197151098400354, + "learning_rate": 9.668821744293442e-05, + "loss": 
0.0031, + "step": 3920 + }, + { + "epoch": 1.0350669039615055, + "grad_norm": 0.014353781007230282, + "learning_rate": 9.655627391476449e-05, + "loss": 0.0009, + "step": 3925 + }, + { + "epoch": 1.0363852086217125, + "grad_norm": 0.13260559737682343, + "learning_rate": 9.642433038659454e-05, + "loss": 0.0323, + "step": 3930 + }, + { + "epoch": 1.0377035132819195, + "grad_norm": 0.006795065477490425, + "learning_rate": 9.62923868584246e-05, + "loss": 0.0022, + "step": 3935 + }, + { + "epoch": 1.0390218179421264, + "grad_norm": 0.2276086062192917, + "learning_rate": 9.616044333025466e-05, + "loss": 0.0221, + "step": 3940 + }, + { + "epoch": 1.0403401226023334, + "grad_norm": 0.06121920794248581, + "learning_rate": 9.602849980208471e-05, + "loss": 0.0037, + "step": 3945 + }, + { + "epoch": 1.0416584272625404, + "grad_norm": 0.9180755019187927, + "learning_rate": 9.589655627391477e-05, + "loss": 0.0589, + "step": 3950 + }, + { + "epoch": 1.0429767319227474, + "grad_norm": 0.07515591382980347, + "learning_rate": 9.576461274574484e-05, + "loss": 0.0653, + "step": 3955 + }, + { + "epoch": 1.0442950365829544, + "grad_norm": 0.018060607835650444, + "learning_rate": 9.563266921757488e-05, + "loss": 0.0178, + "step": 3960 + }, + { + "epoch": 1.0456133412431612, + "grad_norm": 0.02751368284225464, + "learning_rate": 9.550072568940493e-05, + "loss": 0.0076, + "step": 3965 + }, + { + "epoch": 1.0469316459033682, + "grad_norm": 0.653998613357544, + "learning_rate": 9.536878216123499e-05, + "loss": 0.0066, + "step": 3970 + }, + { + "epoch": 1.0482499505635752, + "grad_norm": 0.3117768168449402, + "learning_rate": 9.523683863306505e-05, + "loss": 0.0087, + "step": 3975 + }, + { + "epoch": 1.0495682552237822, + "grad_norm": 0.013952831737697124, + "learning_rate": 9.510489510489511e-05, + "loss": 0.0037, + "step": 3980 + }, + { + "epoch": 1.0508865598839892, + "grad_norm": 0.01806250400841236, + "learning_rate": 9.497295157672517e-05, + "loss": 0.0028, + "step": 3985 + }, + { + "epoch": 1.0522048645441962, + "grad_norm": 0.13678006827831268, + "learning_rate": 9.484100804855523e-05, + "loss": 0.0533, + "step": 3990 + }, + { + "epoch": 1.0535231692044031, + "grad_norm": 0.14869382977485657, + "learning_rate": 9.470906452038528e-05, + "loss": 0.009, + "step": 3995 + }, + { + "epoch": 1.0548414738646101, + "grad_norm": 0.33614659309387207, + "learning_rate": 9.457712099221534e-05, + "loss": 0.0555, + "step": 4000 + }, + { + "epoch": 1.0548414738646101, + "eval_loss": 0.026165226474404335, + "eval_runtime": 452.2482, + "eval_samples_per_second": 7.456, + "eval_steps_per_second": 3.728, + "step": 4000 + }, + { + "epoch": 1.0561597785248171, + "grad_norm": 0.007546027656644583, + "learning_rate": 9.444517746404539e-05, + "loss": 0.0029, + "step": 4005 + }, + { + "epoch": 1.0574780831850241, + "grad_norm": 0.3720332384109497, + "learning_rate": 9.431323393587545e-05, + "loss": 0.0353, + "step": 4010 + }, + { + "epoch": 1.0587963878452311, + "grad_norm": 1.1335264444351196, + "learning_rate": 9.41812904077055e-05, + "loss": 0.0142, + "step": 4015 + }, + { + "epoch": 1.060114692505438, + "grad_norm": 0.024723488837480545, + "learning_rate": 9.404934687953556e-05, + "loss": 0.006, + "step": 4020 + }, + { + "epoch": 1.0614329971656449, + "grad_norm": 0.040354058146476746, + "learning_rate": 9.391740335136562e-05, + "loss": 0.0107, + "step": 4025 + }, + { + "epoch": 1.0627513018258519, + "grad_norm": 0.222810298204422, + "learning_rate": 9.378545982319567e-05, + "loss": 0.0273, + "step": 4030 + }, + { + "epoch": 
1.0640696064860589, + "grad_norm": 0.025684095919132233, + "learning_rate": 9.365351629502574e-05, + "loss": 0.0033, + "step": 4035 + }, + { + "epoch": 1.0653879111462659, + "grad_norm": 0.05338352546095848, + "learning_rate": 9.35215727668558e-05, + "loss": 0.0052, + "step": 4040 + }, + { + "epoch": 1.0667062158064728, + "grad_norm": 0.06182330474257469, + "learning_rate": 9.338962923868585e-05, + "loss": 0.0038, + "step": 4045 + }, + { + "epoch": 1.0680245204666798, + "grad_norm": 0.012170832604169846, + "learning_rate": 9.325768571051591e-05, + "loss": 0.0018, + "step": 4050 + }, + { + "epoch": 1.0693428251268868, + "grad_norm": 0.5424306392669678, + "learning_rate": 9.312574218234596e-05, + "loss": 0.0445, + "step": 4055 + }, + { + "epoch": 1.0706611297870938, + "grad_norm": 0.017939254641532898, + "learning_rate": 9.299379865417602e-05, + "loss": 0.0389, + "step": 4060 + }, + { + "epoch": 1.0719794344473008, + "grad_norm": 0.0060431682504713535, + "learning_rate": 9.286185512600607e-05, + "loss": 0.0025, + "step": 4065 + }, + { + "epoch": 1.0732977391075078, + "grad_norm": 0.0071444883942604065, + "learning_rate": 9.272991159783613e-05, + "loss": 0.0333, + "step": 4070 + }, + { + "epoch": 1.0746160437677148, + "grad_norm": 0.29632750153541565, + "learning_rate": 9.259796806966619e-05, + "loss": 0.0151, + "step": 4075 + }, + { + "epoch": 1.0759343484279218, + "grad_norm": 0.004526323173195124, + "learning_rate": 9.246602454149624e-05, + "loss": 0.006, + "step": 4080 + }, + { + "epoch": 1.0772526530881286, + "grad_norm": 0.023945212364196777, + "learning_rate": 9.23340810133263e-05, + "loss": 0.004, + "step": 4085 + }, + { + "epoch": 1.0785709577483356, + "grad_norm": 0.13235126435756683, + "learning_rate": 9.220213748515635e-05, + "loss": 0.0059, + "step": 4090 + }, + { + "epoch": 1.0798892624085425, + "grad_norm": 0.17592330276966095, + "learning_rate": 9.207019395698642e-05, + "loss": 0.0302, + "step": 4095 + }, + { + "epoch": 1.0812075670687495, + "grad_norm": 0.004582866560667753, + "learning_rate": 9.193825042881648e-05, + "loss": 0.009, + "step": 4100 + }, + { + "epoch": 1.0825258717289565, + "grad_norm": 0.15214525163173676, + "learning_rate": 9.180630690064653e-05, + "loss": 0.0062, + "step": 4105 + }, + { + "epoch": 1.0838441763891635, + "grad_norm": 0.16535983979701996, + "learning_rate": 9.167436337247658e-05, + "loss": 0.0926, + "step": 4110 + }, + { + "epoch": 1.0851624810493705, + "grad_norm": 0.013285227119922638, + "learning_rate": 9.154241984430663e-05, + "loss": 0.0043, + "step": 4115 + }, + { + "epoch": 1.0864807857095775, + "grad_norm": 0.012116984464228153, + "learning_rate": 9.14104763161367e-05, + "loss": 0.0037, + "step": 4120 + }, + { + "epoch": 1.0877990903697845, + "grad_norm": 0.0373845212161541, + "learning_rate": 9.127853278796676e-05, + "loss": 0.0081, + "step": 4125 + }, + { + "epoch": 1.0891173950299915, + "grad_norm": 0.09324615448713303, + "learning_rate": 9.114658925979681e-05, + "loss": 0.0534, + "step": 4130 + }, + { + "epoch": 1.0904356996901985, + "grad_norm": 0.010992968454957008, + "learning_rate": 9.101464573162687e-05, + "loss": 0.0025, + "step": 4135 + }, + { + "epoch": 1.0917540043504055, + "grad_norm": 0.13710318505764008, + "learning_rate": 9.088270220345692e-05, + "loss": 0.0555, + "step": 4140 + }, + { + "epoch": 1.0930723090106123, + "grad_norm": 0.010403074324131012, + "learning_rate": 9.075075867528698e-05, + "loss": 0.0042, + "step": 4145 + }, + { + "epoch": 1.0943906136708192, + "grad_norm": 0.21544460952281952, + "learning_rate": 
9.061881514711705e-05, + "loss": 0.0144, + "step": 4150 + }, + { + "epoch": 1.0957089183310262, + "grad_norm": 0.04194799065589905, + "learning_rate": 9.04868716189471e-05, + "loss": 0.0106, + "step": 4155 + }, + { + "epoch": 1.0970272229912332, + "grad_norm": 0.029204202815890312, + "learning_rate": 9.035492809077715e-05, + "loss": 0.0085, + "step": 4160 + }, + { + "epoch": 1.0983455276514402, + "grad_norm": 0.006751026958227158, + "learning_rate": 9.02229845626072e-05, + "loss": 0.0049, + "step": 4165 + }, + { + "epoch": 1.0996638323116472, + "grad_norm": 0.008232722990214825, + "learning_rate": 9.009104103443726e-05, + "loss": 0.0172, + "step": 4170 + }, + { + "epoch": 1.1009821369718542, + "grad_norm": 0.05630079656839371, + "learning_rate": 8.995909750626733e-05, + "loss": 0.0112, + "step": 4175 + }, + { + "epoch": 1.1023004416320612, + "grad_norm": 0.0011601662263274193, + "learning_rate": 8.982715397809738e-05, + "loss": 0.0317, + "step": 4180 + }, + { + "epoch": 1.1036187462922682, + "grad_norm": 0.006554402410984039, + "learning_rate": 8.969521044992744e-05, + "loss": 0.0035, + "step": 4185 + }, + { + "epoch": 1.1049370509524752, + "grad_norm": 0.34513652324676514, + "learning_rate": 8.956326692175749e-05, + "loss": 0.0036, + "step": 4190 + }, + { + "epoch": 1.1062553556126822, + "grad_norm": 0.283669650554657, + "learning_rate": 8.943132339358755e-05, + "loss": 0.0182, + "step": 4195 + }, + { + "epoch": 1.1075736602728892, + "grad_norm": 0.5376952290534973, + "learning_rate": 8.92993798654176e-05, + "loss": 0.0293, + "step": 4200 + }, + { + "epoch": 1.108891964933096, + "grad_norm": 0.01689724065363407, + "learning_rate": 8.916743633724767e-05, + "loss": 0.0206, + "step": 4205 + }, + { + "epoch": 1.110210269593303, + "grad_norm": 0.026538770645856857, + "learning_rate": 8.903549280907772e-05, + "loss": 0.0181, + "step": 4210 + }, + { + "epoch": 1.11152857425351, + "grad_norm": 0.6372873783111572, + "learning_rate": 8.890354928090777e-05, + "loss": 0.021, + "step": 4215 + }, + { + "epoch": 1.112846878913717, + "grad_norm": 0.06177428737282753, + "learning_rate": 8.877160575273783e-05, + "loss": 0.0033, + "step": 4220 + }, + { + "epoch": 1.114165183573924, + "grad_norm": 0.3712109923362732, + "learning_rate": 8.863966222456788e-05, + "loss": 0.0075, + "step": 4225 + }, + { + "epoch": 1.115483488234131, + "grad_norm": 0.030514653772115707, + "learning_rate": 8.850771869639795e-05, + "loss": 0.0183, + "step": 4230 + }, + { + "epoch": 1.116801792894338, + "grad_norm": 0.012861707247793674, + "learning_rate": 8.837577516822801e-05, + "loss": 0.0032, + "step": 4235 + }, + { + "epoch": 1.118120097554545, + "grad_norm": 0.3278522789478302, + "learning_rate": 8.824383164005806e-05, + "loss": 0.0058, + "step": 4240 + }, + { + "epoch": 1.1194384022147519, + "grad_norm": 0.580259382724762, + "learning_rate": 8.811188811188812e-05, + "loss": 0.0068, + "step": 4245 + }, + { + "epoch": 1.1207567068749589, + "grad_norm": 0.007002575788646936, + "learning_rate": 8.797994458371817e-05, + "loss": 0.0063, + "step": 4250 + }, + { + "epoch": 1.1220750115351659, + "grad_norm": 0.22484643757343292, + "learning_rate": 8.784800105554823e-05, + "loss": 0.0167, + "step": 4255 + }, + { + "epoch": 1.1233933161953726, + "grad_norm": 0.004122686106711626, + "learning_rate": 8.771605752737829e-05, + "loss": 0.002, + "step": 4260 + }, + { + "epoch": 1.1247116208555796, + "grad_norm": 0.009832561016082764, + "learning_rate": 8.758411399920834e-05, + "loss": 0.0029, + "step": 4265 + }, + { + "epoch": 
1.1260299255157866, + "grad_norm": 0.04854527860879898, + "learning_rate": 8.74521704710384e-05, + "loss": 0.0068, + "step": 4270 + }, + { + "epoch": 1.1273482301759936, + "grad_norm": 0.12221235036849976, + "learning_rate": 8.732022694286845e-05, + "loss": 0.003, + "step": 4275 + }, + { + "epoch": 1.1286665348362006, + "grad_norm": 0.005857539363205433, + "learning_rate": 8.718828341469851e-05, + "loss": 0.0022, + "step": 4280 + }, + { + "epoch": 1.1299848394964076, + "grad_norm": 0.10582758486270905, + "learning_rate": 8.705633988652856e-05, + "loss": 0.002, + "step": 4285 + }, + { + "epoch": 1.1313031441566146, + "grad_norm": 0.006190940272063017, + "learning_rate": 8.692439635835863e-05, + "loss": 0.0022, + "step": 4290 + }, + { + "epoch": 1.1326214488168216, + "grad_norm": 0.00221514655277133, + "learning_rate": 8.679245283018869e-05, + "loss": 0.0314, + "step": 4295 + }, + { + "epoch": 1.1339397534770286, + "grad_norm": 0.0796755850315094, + "learning_rate": 8.666050930201874e-05, + "loss": 0.0347, + "step": 4300 + }, + { + "epoch": 1.1352580581372356, + "grad_norm": 0.20088806748390198, + "learning_rate": 8.65285657738488e-05, + "loss": 0.0048, + "step": 4305 + }, + { + "epoch": 1.1365763627974426, + "grad_norm": 0.4018377363681793, + "learning_rate": 8.639662224567884e-05, + "loss": 0.0234, + "step": 4310 + }, + { + "epoch": 1.1378946674576496, + "grad_norm": 0.014961684122681618, + "learning_rate": 8.626467871750891e-05, + "loss": 0.0033, + "step": 4315 + }, + { + "epoch": 1.1392129721178565, + "grad_norm": 0.004534922540187836, + "learning_rate": 8.613273518933897e-05, + "loss": 0.0021, + "step": 4320 + }, + { + "epoch": 1.1405312767780633, + "grad_norm": 0.06340984255075455, + "learning_rate": 8.600079166116902e-05, + "loss": 0.0538, + "step": 4325 + }, + { + "epoch": 1.1418495814382703, + "grad_norm": 0.007374623324722052, + "learning_rate": 8.586884813299908e-05, + "loss": 0.0157, + "step": 4330 + }, + { + "epoch": 1.1431678860984773, + "grad_norm": 0.02313193492591381, + "learning_rate": 8.573690460482913e-05, + "loss": 0.0307, + "step": 4335 + }, + { + "epoch": 1.1444861907586843, + "grad_norm": 0.014071634039282799, + "learning_rate": 8.560496107665919e-05, + "loss": 0.0058, + "step": 4340 + }, + { + "epoch": 1.1458044954188913, + "grad_norm": 1.4664901494979858, + "learning_rate": 8.547301754848926e-05, + "loss": 0.0566, + "step": 4345 + }, + { + "epoch": 1.1471228000790983, + "grad_norm": 0.023680074140429497, + "learning_rate": 8.534107402031931e-05, + "loss": 0.0048, + "step": 4350 + }, + { + "epoch": 1.1484411047393053, + "grad_norm": 0.012555698864161968, + "learning_rate": 8.520913049214937e-05, + "loss": 0.0076, + "step": 4355 + }, + { + "epoch": 1.1497594093995123, + "grad_norm": 0.013624129816889763, + "learning_rate": 8.507718696397941e-05, + "loss": 0.0373, + "step": 4360 + }, + { + "epoch": 1.1510777140597193, + "grad_norm": 0.015372387133538723, + "learning_rate": 8.494524343580947e-05, + "loss": 0.0147, + "step": 4365 + }, + { + "epoch": 1.1523960187199263, + "grad_norm": 0.3312993347644806, + "learning_rate": 8.481329990763954e-05, + "loss": 0.0299, + "step": 4370 + }, + { + "epoch": 1.1537143233801332, + "grad_norm": 0.023838184773921967, + "learning_rate": 8.468135637946959e-05, + "loss": 0.0226, + "step": 4375 + }, + { + "epoch": 1.15503262804034, + "grad_norm": 0.42516952753067017, + "learning_rate": 8.454941285129965e-05, + "loss": 0.0088, + "step": 4380 + }, + { + "epoch": 1.156350932700547, + "grad_norm": 0.6900278925895691, + "learning_rate": 
8.44174693231297e-05, + "loss": 0.0245, + "step": 4385 + }, + { + "epoch": 1.157669237360754, + "grad_norm": 0.2932703197002411, + "learning_rate": 8.428552579495976e-05, + "loss": 0.0207, + "step": 4390 + }, + { + "epoch": 1.158987542020961, + "grad_norm": 0.12942780554294586, + "learning_rate": 8.415358226678982e-05, + "loss": 0.0037, + "step": 4395 + }, + { + "epoch": 1.160305846681168, + "grad_norm": 0.9499046802520752, + "learning_rate": 8.402163873861989e-05, + "loss": 0.0246, + "step": 4400 + }, + { + "epoch": 1.161624151341375, + "grad_norm": 0.008869118988513947, + "learning_rate": 8.388969521044994e-05, + "loss": 0.0171, + "step": 4405 + }, + { + "epoch": 1.162942456001582, + "grad_norm": 1.7409231662750244, + "learning_rate": 8.375775168227998e-05, + "loss": 0.017, + "step": 4410 + }, + { + "epoch": 1.164260760661789, + "grad_norm": 0.0020101398695260286, + "learning_rate": 8.362580815411004e-05, + "loss": 0.0027, + "step": 4415 + }, + { + "epoch": 1.165579065321996, + "grad_norm": 0.0785067081451416, + "learning_rate": 8.34938646259401e-05, + "loss": 0.0043, + "step": 4420 + }, + { + "epoch": 1.166897369982203, + "grad_norm": 0.0029506285209208727, + "learning_rate": 8.336192109777016e-05, + "loss": 0.0109, + "step": 4425 + }, + { + "epoch": 1.16821567464241, + "grad_norm": 0.02216683328151703, + "learning_rate": 8.322997756960022e-05, + "loss": 0.0026, + "step": 4430 + }, + { + "epoch": 1.1695339793026167, + "grad_norm": 0.02216639369726181, + "learning_rate": 8.309803404143027e-05, + "loss": 0.0045, + "step": 4435 + }, + { + "epoch": 1.170852283962824, + "grad_norm": 0.0, + "learning_rate": 8.296609051326033e-05, + "loss": 0.006, + "step": 4440 + }, + { + "epoch": 1.1721705886230307, + "grad_norm": 0.0019736960530281067, + "learning_rate": 8.283414698509039e-05, + "loss": 0.0078, + "step": 4445 + }, + { + "epoch": 1.1734888932832377, + "grad_norm": 0.012957746163010597, + "learning_rate": 8.270220345692044e-05, + "loss": 0.002, + "step": 4450 + }, + { + "epoch": 1.1748071979434447, + "grad_norm": 0.010877869091928005, + "learning_rate": 8.25702599287505e-05, + "loss": 0.0237, + "step": 4455 + }, + { + "epoch": 1.1761255026036517, + "grad_norm": 0.005947659723460674, + "learning_rate": 8.243831640058055e-05, + "loss": 0.0341, + "step": 4460 + }, + { + "epoch": 1.1774438072638587, + "grad_norm": 0.0005026470171287656, + "learning_rate": 8.230637287241061e-05, + "loss": 0.0033, + "step": 4465 + }, + { + "epoch": 1.1787621119240657, + "grad_norm": 0.022054588422179222, + "learning_rate": 8.217442934424066e-05, + "loss": 0.0042, + "step": 4470 + }, + { + "epoch": 1.1800804165842727, + "grad_norm": 0.7929030656814575, + "learning_rate": 8.204248581607072e-05, + "loss": 0.0076, + "step": 4475 + }, + { + "epoch": 1.1813987212444796, + "grad_norm": 0.39052629470825195, + "learning_rate": 8.191054228790078e-05, + "loss": 0.0228, + "step": 4480 + }, + { + "epoch": 1.1827170259046866, + "grad_norm": 0.007177622988820076, + "learning_rate": 8.177859875973084e-05, + "loss": 0.01, + "step": 4485 + }, + { + "epoch": 1.1840353305648936, + "grad_norm": 0.006175135262310505, + "learning_rate": 8.16466552315609e-05, + "loss": 0.0037, + "step": 4490 + }, + { + "epoch": 1.1853536352251006, + "grad_norm": 0.0356481671333313, + "learning_rate": 8.151471170339096e-05, + "loss": 0.0024, + "step": 4495 + }, + { + "epoch": 1.1866719398853074, + "grad_norm": 0.19069480895996094, + "learning_rate": 8.138276817522101e-05, + "loss": 0.0048, + "step": 4500 + }, + { + "epoch": 1.1866719398853074, + 
"eval_loss": 0.026386437937617302, + "eval_runtime": 452.2896, + "eval_samples_per_second": 7.455, + "eval_steps_per_second": 3.728, + "step": 4500 + }, + { + "epoch": 1.1879902445455144, + "grad_norm": 0.002254961524158716, + "learning_rate": 8.125082464705107e-05, + "loss": 0.0014, + "step": 4505 + }, + { + "epoch": 1.1893085492057214, + "grad_norm": 0.8026870489120483, + "learning_rate": 8.111888111888112e-05, + "loss": 0.0411, + "step": 4510 + }, + { + "epoch": 1.1906268538659284, + "grad_norm": 0.47328072786331177, + "learning_rate": 8.098693759071118e-05, + "loss": 0.0271, + "step": 4515 + }, + { + "epoch": 1.1919451585261354, + "grad_norm": 0.4888288676738739, + "learning_rate": 8.085499406254123e-05, + "loss": 0.039, + "step": 4520 + }, + { + "epoch": 1.1932634631863424, + "grad_norm": 0.000925812462810427, + "learning_rate": 8.072305053437129e-05, + "loss": 0.0461, + "step": 4525 + }, + { + "epoch": 1.1945817678465493, + "grad_norm": 0.12472371757030487, + "learning_rate": 8.059110700620135e-05, + "loss": 0.0037, + "step": 4530 + }, + { + "epoch": 1.1959000725067563, + "grad_norm": 0.002875336678698659, + "learning_rate": 8.04591634780314e-05, + "loss": 0.0425, + "step": 4535 + }, + { + "epoch": 1.1972183771669633, + "grad_norm": 0.042056187987327576, + "learning_rate": 8.032721994986147e-05, + "loss": 0.0068, + "step": 4540 + }, + { + "epoch": 1.1985366818271703, + "grad_norm": 0.157605841755867, + "learning_rate": 8.019527642169153e-05, + "loss": 0.0179, + "step": 4545 + }, + { + "epoch": 1.1998549864873773, + "grad_norm": 0.005153563339263201, + "learning_rate": 8.006333289352158e-05, + "loss": 0.0045, + "step": 4550 + }, + { + "epoch": 1.201173291147584, + "grad_norm": 0.02541598491370678, + "learning_rate": 7.993138936535164e-05, + "loss": 0.0041, + "step": 4555 + }, + { + "epoch": 1.2024915958077913, + "grad_norm": 0.04266195371747017, + "learning_rate": 7.979944583718168e-05, + "loss": 0.0121, + "step": 4560 + }, + { + "epoch": 1.203809900467998, + "grad_norm": 0.36108532547950745, + "learning_rate": 7.966750230901175e-05, + "loss": 0.0147, + "step": 4565 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 0.40405452251434326, + "learning_rate": 7.95355587808418e-05, + "loss": 0.0056, + "step": 4570 + }, + { + "epoch": 1.206446509788412, + "grad_norm": 0.030422702431678772, + "learning_rate": 7.940361525267186e-05, + "loss": 0.0055, + "step": 4575 + }, + { + "epoch": 1.207764814448619, + "grad_norm": 0.014555396512150764, + "learning_rate": 7.927167172450192e-05, + "loss": 0.0029, + "step": 4580 + }, + { + "epoch": 1.209083119108826, + "grad_norm": 0.33962950110435486, + "learning_rate": 7.913972819633197e-05, + "loss": 0.0191, + "step": 4585 + }, + { + "epoch": 1.210401423769033, + "grad_norm": 0.040150560438632965, + "learning_rate": 7.900778466816203e-05, + "loss": 0.0096, + "step": 4590 + }, + { + "epoch": 1.21171972842924, + "grad_norm": 0.2968510091304779, + "learning_rate": 7.88758411399921e-05, + "loss": 0.0311, + "step": 4595 + }, + { + "epoch": 1.213038033089447, + "grad_norm": 0.04709814116358757, + "learning_rate": 7.874389761182215e-05, + "loss": 0.0175, + "step": 4600 + }, + { + "epoch": 1.214356337749654, + "grad_norm": 0.1379537284374237, + "learning_rate": 7.861195408365221e-05, + "loss": 0.02, + "step": 4605 + }, + { + "epoch": 1.215674642409861, + "grad_norm": 0.018291711807250977, + "learning_rate": 7.848001055548225e-05, + "loss": 0.003, + "step": 4610 + }, + { + "epoch": 1.216992947070068, + "grad_norm": 0.041676126420497894, + "learning_rate": 
7.83480670273123e-05, + "loss": 0.0054, + "step": 4615 + }, + { + "epoch": 1.2183112517302748, + "grad_norm": 0.0013747498160228133, + "learning_rate": 7.821612349914237e-05, + "loss": 0.0132, + "step": 4620 + }, + { + "epoch": 1.2196295563904818, + "grad_norm": 0.0050489697605371475, + "learning_rate": 7.808417997097243e-05, + "loss": 0.0272, + "step": 4625 + }, + { + "epoch": 1.2209478610506888, + "grad_norm": 0.017974581569433212, + "learning_rate": 7.795223644280249e-05, + "loss": 0.0037, + "step": 4630 + }, + { + "epoch": 1.2222661657108957, + "grad_norm": 0.001916698063723743, + "learning_rate": 7.782029291463254e-05, + "loss": 0.002, + "step": 4635 + }, + { + "epoch": 1.2235844703711027, + "grad_norm": 0.05344574153423309, + "learning_rate": 7.76883493864626e-05, + "loss": 0.0114, + "step": 4640 + }, + { + "epoch": 1.2249027750313097, + "grad_norm": 0.22823786735534668, + "learning_rate": 7.755640585829265e-05, + "loss": 0.0296, + "step": 4645 + }, + { + "epoch": 1.2262210796915167, + "grad_norm": 0.02051074244081974, + "learning_rate": 7.742446233012272e-05, + "loss": 0.0037, + "step": 4650 + }, + { + "epoch": 1.2275393843517237, + "grad_norm": 0.9797061681747437, + "learning_rate": 7.729251880195276e-05, + "loss": 0.011, + "step": 4655 + }, + { + "epoch": 1.2288576890119307, + "grad_norm": 0.0017285927897319198, + "learning_rate": 7.716057527378282e-05, + "loss": 0.0224, + "step": 4660 + }, + { + "epoch": 1.2301759936721377, + "grad_norm": 0.021783018484711647, + "learning_rate": 7.702863174561288e-05, + "loss": 0.0174, + "step": 4665 + }, + { + "epoch": 1.2314942983323447, + "grad_norm": 0.00763307698071003, + "learning_rate": 7.689668821744293e-05, + "loss": 0.0516, + "step": 4670 + }, + { + "epoch": 1.2328126029925515, + "grad_norm": 0.32605209946632385, + "learning_rate": 7.676474468927299e-05, + "loss": 0.0301, + "step": 4675 + }, + { + "epoch": 1.2341309076527585, + "grad_norm": 1.2027722597122192, + "learning_rate": 7.663280116110306e-05, + "loss": 0.0474, + "step": 4680 + }, + { + "epoch": 1.2354492123129655, + "grad_norm": 0.10201717168092728, + "learning_rate": 7.650085763293311e-05, + "loss": 0.0144, + "step": 4685 + }, + { + "epoch": 1.2367675169731724, + "grad_norm": 0.013835664838552475, + "learning_rate": 7.636891410476317e-05, + "loss": 0.0024, + "step": 4690 + }, + { + "epoch": 1.2380858216333794, + "grad_norm": 0.005699916277080774, + "learning_rate": 7.623697057659322e-05, + "loss": 0.0089, + "step": 4695 + }, + { + "epoch": 1.2394041262935864, + "grad_norm": 0.16583332419395447, + "learning_rate": 7.610502704842328e-05, + "loss": 0.019, + "step": 4700 + }, + { + "epoch": 1.2407224309537934, + "grad_norm": 0.2734023332595825, + "learning_rate": 7.597308352025333e-05, + "loss": 0.0041, + "step": 4705 + }, + { + "epoch": 1.2420407356140004, + "grad_norm": 0.04209504276514053, + "learning_rate": 7.584113999208339e-05, + "loss": 0.0292, + "step": 4710 + }, + { + "epoch": 1.2433590402742074, + "grad_norm": 0.0303195733577013, + "learning_rate": 7.570919646391345e-05, + "loss": 0.0019, + "step": 4715 + }, + { + "epoch": 1.2446773449344144, + "grad_norm": 0.014011899940669537, + "learning_rate": 7.55772529357435e-05, + "loss": 0.0236, + "step": 4720 + }, + { + "epoch": 1.2459956495946214, + "grad_norm": 0.37838876247406006, + "learning_rate": 7.544530940757356e-05, + "loss": 0.0081, + "step": 4725 + }, + { + "epoch": 1.2473139542548284, + "grad_norm": 0.003717717481777072, + "learning_rate": 7.531336587940361e-05, + "loss": 0.0036, + "step": 4730 + }, + { + "epoch": 
1.2486322589150354, + "grad_norm": 1.2284752130508423, + "learning_rate": 7.518142235123368e-05, + "loss": 0.0089, + "step": 4735 + }, + { + "epoch": 1.2499505635752421, + "grad_norm": 0.015356095507740974, + "learning_rate": 7.504947882306374e-05, + "loss": 0.0074, + "step": 4740 + }, + { + "epoch": 1.2512688682354491, + "grad_norm": 0.0020383282098919153, + "learning_rate": 7.49175352948938e-05, + "loss": 0.0444, + "step": 4745 + }, + { + "epoch": 1.2525871728956561, + "grad_norm": 0.006680132355540991, + "learning_rate": 7.478559176672385e-05, + "loss": 0.009, + "step": 4750 + }, + { + "epoch": 1.2539054775558631, + "grad_norm": 0.01650019735097885, + "learning_rate": 7.465364823855389e-05, + "loss": 0.0022, + "step": 4755 + }, + { + "epoch": 1.2552237822160701, + "grad_norm": 0.009536102414131165, + "learning_rate": 7.452170471038396e-05, + "loss": 0.0026, + "step": 4760 + }, + { + "epoch": 1.256542086876277, + "grad_norm": 0.04677430912852287, + "learning_rate": 7.438976118221402e-05, + "loss": 0.004, + "step": 4765 + }, + { + "epoch": 1.257860391536484, + "grad_norm": 0.007777783088386059, + "learning_rate": 7.425781765404407e-05, + "loss": 0.0112, + "step": 4770 + }, + { + "epoch": 1.259178696196691, + "grad_norm": 0.03724197298288345, + "learning_rate": 7.412587412587413e-05, + "loss": 0.0065, + "step": 4775 + }, + { + "epoch": 1.260497000856898, + "grad_norm": 0.0023958412930369377, + "learning_rate": 7.399393059770418e-05, + "loss": 0.0238, + "step": 4780 + }, + { + "epoch": 1.261815305517105, + "grad_norm": 0.0036889975890517235, + "learning_rate": 7.386198706953424e-05, + "loss": 0.0012, + "step": 4785 + }, + { + "epoch": 1.263133610177312, + "grad_norm": 0.0009220903157256544, + "learning_rate": 7.373004354136431e-05, + "loss": 0.0017, + "step": 4790 + }, + { + "epoch": 1.2644519148375188, + "grad_norm": 0.0033395602367818356, + "learning_rate": 7.359810001319436e-05, + "loss": 0.0474, + "step": 4795 + }, + { + "epoch": 1.265770219497726, + "grad_norm": 0.004093261435627937, + "learning_rate": 7.346615648502442e-05, + "loss": 0.0025, + "step": 4800 + }, + { + "epoch": 1.2670885241579328, + "grad_norm": 0.004395488649606705, + "learning_rate": 7.333421295685446e-05, + "loss": 0.0011, + "step": 4805 + }, + { + "epoch": 1.2684068288181398, + "grad_norm": 0.024034051224589348, + "learning_rate": 7.320226942868452e-05, + "loss": 0.0027, + "step": 4810 + }, + { + "epoch": 1.2697251334783468, + "grad_norm": 0.9501499533653259, + "learning_rate": 7.307032590051459e-05, + "loss": 0.0279, + "step": 4815 + }, + { + "epoch": 1.2710434381385538, + "grad_norm": 0.008805549703538418, + "learning_rate": 7.293838237234464e-05, + "loss": 0.0403, + "step": 4820 + }, + { + "epoch": 1.2723617427987608, + "grad_norm": 0.01750873774290085, + "learning_rate": 7.28064388441747e-05, + "loss": 0.0571, + "step": 4825 + }, + { + "epoch": 1.2736800474589678, + "grad_norm": 0.004490260500460863, + "learning_rate": 7.267449531600475e-05, + "loss": 0.0269, + "step": 4830 + }, + { + "epoch": 1.2749983521191748, + "grad_norm": 0.07510064542293549, + "learning_rate": 7.254255178783481e-05, + "loss": 0.0123, + "step": 4835 + }, + { + "epoch": 1.2763166567793818, + "grad_norm": 0.039783038198947906, + "learning_rate": 7.241060825966486e-05, + "loss": 0.0137, + "step": 4840 + }, + { + "epoch": 1.2776349614395888, + "grad_norm": 0.019004900008440018, + "learning_rate": 7.227866473149493e-05, + "loss": 0.0047, + "step": 4845 + }, + { + "epoch": 1.2789532660997955, + "grad_norm": 0.04813052713871002, + 
"learning_rate": 7.214672120332499e-05, + "loss": 0.0021, + "step": 4850 + }, + { + "epoch": 1.2802715707600028, + "grad_norm": 0.00835048221051693, + "learning_rate": 7.201477767515503e-05, + "loss": 0.0014, + "step": 4855 + }, + { + "epoch": 1.2815898754202095, + "grad_norm": 0.008609198965132236, + "learning_rate": 7.188283414698509e-05, + "loss": 0.0219, + "step": 4860 + }, + { + "epoch": 1.2829081800804165, + "grad_norm": 0.007337458431720734, + "learning_rate": 7.175089061881514e-05, + "loss": 0.0014, + "step": 4865 + }, + { + "epoch": 1.2842264847406235, + "grad_norm": 0.0032645913306623697, + "learning_rate": 7.161894709064521e-05, + "loss": 0.0026, + "step": 4870 + }, + { + "epoch": 1.2855447894008305, + "grad_norm": 0.27384671568870544, + "learning_rate": 7.148700356247527e-05, + "loss": 0.0227, + "step": 4875 + }, + { + "epoch": 1.2868630940610375, + "grad_norm": 0.03584875538945198, + "learning_rate": 7.135506003430532e-05, + "loss": 0.0299, + "step": 4880 + }, + { + "epoch": 1.2881813987212445, + "grad_norm": 0.03482440486550331, + "learning_rate": 7.122311650613538e-05, + "loss": 0.0125, + "step": 4885 + }, + { + "epoch": 1.2894997033814515, + "grad_norm": 0.005974395200610161, + "learning_rate": 7.109117297796543e-05, + "loss": 0.0029, + "step": 4890 + }, + { + "epoch": 1.2908180080416585, + "grad_norm": 0.01820153370499611, + "learning_rate": 7.095922944979549e-05, + "loss": 0.0254, + "step": 4895 + }, + { + "epoch": 1.2921363127018655, + "grad_norm": 0.1733965277671814, + "learning_rate": 7.082728592162555e-05, + "loss": 0.028, + "step": 4900 + }, + { + "epoch": 1.2934546173620725, + "grad_norm": 1.3017303943634033, + "learning_rate": 7.06953423934556e-05, + "loss": 0.0213, + "step": 4905 + }, + { + "epoch": 1.2947729220222794, + "grad_norm": 0.01360877975821495, + "learning_rate": 7.056339886528566e-05, + "loss": 0.0039, + "step": 4910 + }, + { + "epoch": 1.2960912266824862, + "grad_norm": 0.01503999624401331, + "learning_rate": 7.043145533711571e-05, + "loss": 0.0102, + "step": 4915 + }, + { + "epoch": 1.2974095313426934, + "grad_norm": 0.2200804352760315, + "learning_rate": 7.029951180894577e-05, + "loss": 0.0461, + "step": 4920 + }, + { + "epoch": 1.2987278360029002, + "grad_norm": 0.08512946963310242, + "learning_rate": 7.016756828077582e-05, + "loss": 0.0066, + "step": 4925 + }, + { + "epoch": 1.3000461406631072, + "grad_norm": 0.08296570926904678, + "learning_rate": 7.00356247526059e-05, + "loss": 0.0223, + "step": 4930 + }, + { + "epoch": 1.3013644453233142, + "grad_norm": 0.008866079151630402, + "learning_rate": 6.990368122443595e-05, + "loss": 0.0032, + "step": 4935 + }, + { + "epoch": 1.3026827499835212, + "grad_norm": 0.024493014439940453, + "learning_rate": 6.9771737696266e-05, + "loss": 0.0128, + "step": 4940 + }, + { + "epoch": 1.3040010546437282, + "grad_norm": 0.08965341746807098, + "learning_rate": 6.963979416809606e-05, + "loss": 0.028, + "step": 4945 + }, + { + "epoch": 1.3053193593039352, + "grad_norm": 0.023156631737947464, + "learning_rate": 6.950785063992612e-05, + "loss": 0.0187, + "step": 4950 + }, + { + "epoch": 1.3066376639641422, + "grad_norm": 0.18552155792713165, + "learning_rate": 6.937590711175617e-05, + "loss": 0.0424, + "step": 4955 + }, + { + "epoch": 1.3079559686243492, + "grad_norm": 0.02200198918581009, + "learning_rate": 6.924396358358623e-05, + "loss": 0.0148, + "step": 4960 + }, + { + "epoch": 1.3092742732845561, + "grad_norm": 0.00568364467471838, + "learning_rate": 6.911202005541628e-05, + "loss": 0.0199, + "step": 4965 + }, + { 
+ "epoch": 1.310592577944763, + "grad_norm": 0.021591177210211754, + "learning_rate": 6.898007652724634e-05, + "loss": 0.0092, + "step": 4970 + }, + { + "epoch": 1.3119108826049701, + "grad_norm": 0.327177494764328, + "learning_rate": 6.88481329990764e-05, + "loss": 0.0047, + "step": 4975 + }, + { + "epoch": 1.313229187265177, + "grad_norm": 0.024512887001037598, + "learning_rate": 6.871618947090645e-05, + "loss": 0.0046, + "step": 4980 + }, + { + "epoch": 1.314547491925384, + "grad_norm": 0.05725006014108658, + "learning_rate": 6.858424594273652e-05, + "loss": 0.0227, + "step": 4985 + }, + { + "epoch": 1.3158657965855909, + "grad_norm": 0.011280277743935585, + "learning_rate": 6.845230241456658e-05, + "loss": 0.0056, + "step": 4990 + }, + { + "epoch": 1.3171841012457979, + "grad_norm": 0.022504402324557304, + "learning_rate": 6.832035888639663e-05, + "loss": 0.0029, + "step": 4995 + }, + { + "epoch": 1.3185024059060049, + "grad_norm": 0.02168826013803482, + "learning_rate": 6.818841535822669e-05, + "loss": 0.0198, + "step": 5000 + }, + { + "epoch": 1.3185024059060049, + "eval_loss": 0.025039294734597206, + "eval_runtime": 452.1097, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 5000 + }, + { + "epoch": 1.3198207105662119, + "grad_norm": 0.0064329709857702255, + "learning_rate": 6.805647183005673e-05, + "loss": 0.0299, + "step": 5005 + }, + { + "epoch": 1.3211390152264189, + "grad_norm": 0.00267885928042233, + "learning_rate": 6.79245283018868e-05, + "loss": 0.0065, + "step": 5010 + }, + { + "epoch": 1.3224573198866258, + "grad_norm": 0.6842889189720154, + "learning_rate": 6.779258477371685e-05, + "loss": 0.008, + "step": 5015 + }, + { + "epoch": 1.3237756245468328, + "grad_norm": 0.002985635306686163, + "learning_rate": 6.766064124554691e-05, + "loss": 0.0119, + "step": 5020 + }, + { + "epoch": 1.3250939292070396, + "grad_norm": 0.019304940477013588, + "learning_rate": 6.752869771737696e-05, + "loss": 0.0041, + "step": 5025 + }, + { + "epoch": 1.3264122338672468, + "grad_norm": 0.011305035091936588, + "learning_rate": 6.739675418920702e-05, + "loss": 0.0031, + "step": 5030 + }, + { + "epoch": 1.3277305385274536, + "grad_norm": 0.006184784695506096, + "learning_rate": 6.726481066103708e-05, + "loss": 0.0081, + "step": 5035 + }, + { + "epoch": 1.3290488431876606, + "grad_norm": 0.0073184361681342125, + "learning_rate": 6.713286713286715e-05, + "loss": 0.0202, + "step": 5040 + }, + { + "epoch": 1.3303671478478676, + "grad_norm": 0.006566181313246489, + "learning_rate": 6.70009236046972e-05, + "loss": 0.0052, + "step": 5045 + }, + { + "epoch": 1.3316854525080746, + "grad_norm": 0.31427526473999023, + "learning_rate": 6.686898007652726e-05, + "loss": 0.017, + "step": 5050 + }, + { + "epoch": 1.3330037571682816, + "grad_norm": 0.005085447803139687, + "learning_rate": 6.67370365483573e-05, + "loss": 0.009, + "step": 5055 + }, + { + "epoch": 1.3343220618284886, + "grad_norm": 0.2745366096496582, + "learning_rate": 6.660509302018735e-05, + "loss": 0.0119, + "step": 5060 + }, + { + "epoch": 1.3356403664886956, + "grad_norm": 0.2871796786785126, + "learning_rate": 6.647314949201742e-05, + "loss": 0.0158, + "step": 5065 + }, + { + "epoch": 1.3369586711489025, + "grad_norm": 0.2774186134338379, + "learning_rate": 6.634120596384748e-05, + "loss": 0.0084, + "step": 5070 + }, + { + "epoch": 1.3382769758091095, + "grad_norm": 0.013278775848448277, + "learning_rate": 6.620926243567753e-05, + "loss": 0.0111, + "step": 5075 + }, + { + "epoch": 1.3395952804693165, + 
"grad_norm": 0.01614517532289028, + "learning_rate": 6.607731890750759e-05, + "loss": 0.0066, + "step": 5080 + }, + { + "epoch": 1.3409135851295235, + "grad_norm": 0.0037789656780660152, + "learning_rate": 6.594537537933765e-05, + "loss": 0.0142, + "step": 5085 + }, + { + "epoch": 1.3422318897897303, + "grad_norm": 0.03221861273050308, + "learning_rate": 6.58134318511677e-05, + "loss": 0.0155, + "step": 5090 + }, + { + "epoch": 1.3435501944499375, + "grad_norm": 0.005637989845126867, + "learning_rate": 6.568148832299776e-05, + "loss": 0.0022, + "step": 5095 + }, + { + "epoch": 1.3448684991101443, + "grad_norm": 0.0017844432732090354, + "learning_rate": 6.554954479482783e-05, + "loss": 0.0217, + "step": 5100 + }, + { + "epoch": 1.3461868037703513, + "grad_norm": 0.08099021762609482, + "learning_rate": 6.541760126665787e-05, + "loss": 0.0222, + "step": 5105 + }, + { + "epoch": 1.3475051084305583, + "grad_norm": 0.011909045279026031, + "learning_rate": 6.528565773848792e-05, + "loss": 0.0058, + "step": 5110 + }, + { + "epoch": 1.3488234130907653, + "grad_norm": 0.7332578301429749, + "learning_rate": 6.515371421031798e-05, + "loss": 0.0286, + "step": 5115 + }, + { + "epoch": 1.3501417177509722, + "grad_norm": 0.3415885865688324, + "learning_rate": 6.502177068214804e-05, + "loss": 0.1191, + "step": 5120 + }, + { + "epoch": 1.3514600224111792, + "grad_norm": 0.00904211588203907, + "learning_rate": 6.48898271539781e-05, + "loss": 0.0043, + "step": 5125 + }, + { + "epoch": 1.3527783270713862, + "grad_norm": 0.1978830248117447, + "learning_rate": 6.475788362580816e-05, + "loss": 0.0316, + "step": 5130 + }, + { + "epoch": 1.3540966317315932, + "grad_norm": 0.10229042172431946, + "learning_rate": 6.462594009763822e-05, + "loss": 0.0194, + "step": 5135 + }, + { + "epoch": 1.3554149363918002, + "grad_norm": 0.4457210600376129, + "learning_rate": 6.449399656946827e-05, + "loss": 0.0276, + "step": 5140 + }, + { + "epoch": 1.356733241052007, + "grad_norm": 0.023706572130322456, + "learning_rate": 6.436205304129833e-05, + "loss": 0.0163, + "step": 5145 + }, + { + "epoch": 1.3580515457122142, + "grad_norm": 1.166896939277649, + "learning_rate": 6.423010951312838e-05, + "loss": 0.0189, + "step": 5150 + }, + { + "epoch": 1.359369850372421, + "grad_norm": 0.0016115796752274036, + "learning_rate": 6.409816598495844e-05, + "loss": 0.0191, + "step": 5155 + }, + { + "epoch": 1.360688155032628, + "grad_norm": 0.00786682777106762, + "learning_rate": 6.39662224567885e-05, + "loss": 0.0119, + "step": 5160 + }, + { + "epoch": 1.362006459692835, + "grad_norm": 1.042732834815979, + "learning_rate": 6.383427892861855e-05, + "loss": 0.0497, + "step": 5165 + }, + { + "epoch": 1.363324764353042, + "grad_norm": 0.007983304560184479, + "learning_rate": 6.37023354004486e-05, + "loss": 0.044, + "step": 5170 + }, + { + "epoch": 1.364643069013249, + "grad_norm": 0.009767642244696617, + "learning_rate": 6.357039187227866e-05, + "loss": 0.0405, + "step": 5175 + }, + { + "epoch": 1.365961373673456, + "grad_norm": 0.03164628520607948, + "learning_rate": 6.343844834410873e-05, + "loss": 0.0138, + "step": 5180 + }, + { + "epoch": 1.367279678333663, + "grad_norm": 0.004159921780228615, + "learning_rate": 6.330650481593879e-05, + "loss": 0.0045, + "step": 5185 + }, + { + "epoch": 1.36859798299387, + "grad_norm": 0.004395391326397657, + "learning_rate": 6.317456128776884e-05, + "loss": 0.0046, + "step": 5190 + }, + { + "epoch": 1.369916287654077, + "grad_norm": 0.011886746622622013, + "learning_rate": 6.30426177595989e-05, + "loss": 
0.0064, + "step": 5195 + }, + { + "epoch": 1.371234592314284, + "grad_norm": 0.2259266972541809, + "learning_rate": 6.291067423142895e-05, + "loss": 0.0076, + "step": 5200 + }, + { + "epoch": 1.372552896974491, + "grad_norm": 0.01407301053404808, + "learning_rate": 6.277873070325901e-05, + "loss": 0.0201, + "step": 5205 + }, + { + "epoch": 1.3738712016346977, + "grad_norm": 0.00911578256636858, + "learning_rate": 6.264678717508906e-05, + "loss": 0.0164, + "step": 5210 + }, + { + "epoch": 1.3751895062949049, + "grad_norm": 0.20968014001846313, + "learning_rate": 6.251484364691912e-05, + "loss": 0.0075, + "step": 5215 + }, + { + "epoch": 1.3765078109551117, + "grad_norm": 0.008801166899502277, + "learning_rate": 6.238290011874918e-05, + "loss": 0.0068, + "step": 5220 + }, + { + "epoch": 1.3778261156153186, + "grad_norm": 0.007181806955486536, + "learning_rate": 6.225095659057923e-05, + "loss": 0.0136, + "step": 5225 + }, + { + "epoch": 1.3791444202755256, + "grad_norm": 0.7527109980583191, + "learning_rate": 6.211901306240929e-05, + "loss": 0.0287, + "step": 5230 + }, + { + "epoch": 1.3804627249357326, + "grad_norm": 0.039015207439661026, + "learning_rate": 6.198706953423936e-05, + "loss": 0.0326, + "step": 5235 + }, + { + "epoch": 1.3817810295959396, + "grad_norm": 0.021076606586575508, + "learning_rate": 6.185512600606941e-05, + "loss": 0.0191, + "step": 5240 + }, + { + "epoch": 1.3830993342561466, + "grad_norm": 0.016630731523036957, + "learning_rate": 6.172318247789947e-05, + "loss": 0.0131, + "step": 5245 + }, + { + "epoch": 1.3844176389163536, + "grad_norm": 0.011133644729852676, + "learning_rate": 6.159123894972952e-05, + "loss": 0.0029, + "step": 5250 + }, + { + "epoch": 1.3857359435765606, + "grad_norm": 0.6434677243232727, + "learning_rate": 6.145929542155957e-05, + "loss": 0.0091, + "step": 5255 + }, + { + "epoch": 1.3870542482367676, + "grad_norm": 0.051020298153162, + "learning_rate": 6.132735189338964e-05, + "loss": 0.0086, + "step": 5260 + }, + { + "epoch": 1.3883725528969744, + "grad_norm": 0.016413932666182518, + "learning_rate": 6.119540836521969e-05, + "loss": 0.0061, + "step": 5265 + }, + { + "epoch": 1.3896908575571816, + "grad_norm": 0.005769540090113878, + "learning_rate": 6.106346483704975e-05, + "loss": 0.0027, + "step": 5270 + }, + { + "epoch": 1.3910091622173884, + "grad_norm": 0.06687796860933304, + "learning_rate": 6.09315213088798e-05, + "loss": 0.0423, + "step": 5275 + }, + { + "epoch": 1.3923274668775953, + "grad_norm": 0.005641553085297346, + "learning_rate": 6.079957778070986e-05, + "loss": 0.0353, + "step": 5280 + }, + { + "epoch": 1.3936457715378023, + "grad_norm": 0.04460568353533745, + "learning_rate": 6.066763425253992e-05, + "loss": 0.0041, + "step": 5285 + }, + { + "epoch": 1.3949640761980093, + "grad_norm": 0.0387534461915493, + "learning_rate": 6.0535690724369976e-05, + "loss": 0.006, + "step": 5290 + }, + { + "epoch": 1.3962823808582163, + "grad_norm": 0.010292598977684975, + "learning_rate": 6.040374719620003e-05, + "loss": 0.0038, + "step": 5295 + }, + { + "epoch": 1.3976006855184233, + "grad_norm": 0.3646155297756195, + "learning_rate": 6.0271803668030094e-05, + "loss": 0.0111, + "step": 5300 + }, + { + "epoch": 1.3989189901786303, + "grad_norm": 0.022035539150238037, + "learning_rate": 6.0139860139860136e-05, + "loss": 0.0507, + "step": 5305 + }, + { + "epoch": 1.4002372948388373, + "grad_norm": 0.003314939560368657, + "learning_rate": 6.00079166116902e-05, + "loss": 0.0132, + "step": 5310 + }, + { + "epoch": 1.4015555994990443, + "grad_norm": 
0.0838267058134079, + "learning_rate": 5.9875973083520254e-05, + "loss": 0.0105, + "step": 5315 + }, + { + "epoch": 1.4028739041592513, + "grad_norm": 0.009368584491312504, + "learning_rate": 5.974402955535031e-05, + "loss": 0.0026, + "step": 5320 + }, + { + "epoch": 1.4041922088194583, + "grad_norm": 0.031248098239302635, + "learning_rate": 5.961208602718037e-05, + "loss": 0.0151, + "step": 5325 + }, + { + "epoch": 1.405510513479665, + "grad_norm": 0.06447605788707733, + "learning_rate": 5.948014249901043e-05, + "loss": 0.0219, + "step": 5330 + }, + { + "epoch": 1.4068288181398723, + "grad_norm": 0.010814374312758446, + "learning_rate": 5.9348198970840484e-05, + "loss": 0.0038, + "step": 5335 + }, + { + "epoch": 1.408147122800079, + "grad_norm": 0.6235967874526978, + "learning_rate": 5.9216255442670546e-05, + "loss": 0.0354, + "step": 5340 + }, + { + "epoch": 1.409465427460286, + "grad_norm": 0.026741521432995796, + "learning_rate": 5.90843119145006e-05, + "loss": 0.0032, + "step": 5345 + }, + { + "epoch": 1.410783732120493, + "grad_norm": 0.019413433969020844, + "learning_rate": 5.895236838633066e-05, + "loss": 0.0216, + "step": 5350 + }, + { + "epoch": 1.4121020367807, + "grad_norm": 0.0735543966293335, + "learning_rate": 5.8820424858160706e-05, + "loss": 0.0033, + "step": 5355 + }, + { + "epoch": 1.413420341440907, + "grad_norm": 0.005189546383917332, + "learning_rate": 5.868848132999076e-05, + "loss": 0.021, + "step": 5360 + }, + { + "epoch": 1.414738646101114, + "grad_norm": 0.21240335702896118, + "learning_rate": 5.8556537801820824e-05, + "loss": 0.0294, + "step": 5365 + }, + { + "epoch": 1.416056950761321, + "grad_norm": 0.010165920481085777, + "learning_rate": 5.842459427365088e-05, + "loss": 0.0021, + "step": 5370 + }, + { + "epoch": 1.417375255421528, + "grad_norm": 0.026774069294333458, + "learning_rate": 5.8292650745480936e-05, + "loss": 0.0299, + "step": 5375 + }, + { + "epoch": 1.418693560081735, + "grad_norm": 0.0019810455851256847, + "learning_rate": 5.816070721731099e-05, + "loss": 0.0029, + "step": 5380 + }, + { + "epoch": 1.4200118647419417, + "grad_norm": 0.038888879120349884, + "learning_rate": 5.8028763689141054e-05, + "loss": 0.0069, + "step": 5385 + }, + { + "epoch": 1.421330169402149, + "grad_norm": 0.016180936247110367, + "learning_rate": 5.789682016097111e-05, + "loss": 0.0032, + "step": 5390 + }, + { + "epoch": 1.4226484740623557, + "grad_norm": 0.01119404286146164, + "learning_rate": 5.7764876632801165e-05, + "loss": 0.0024, + "step": 5395 + }, + { + "epoch": 1.4239667787225627, + "grad_norm": 0.010486694052815437, + "learning_rate": 5.763293310463123e-05, + "loss": 0.0324, + "step": 5400 + }, + { + "epoch": 1.4252850833827697, + "grad_norm": 0.005453066434711218, + "learning_rate": 5.750098957646127e-05, + "loss": 0.0038, + "step": 5405 + }, + { + "epoch": 1.4266033880429767, + "grad_norm": 0.17556461691856384, + "learning_rate": 5.736904604829133e-05, + "loss": 0.0305, + "step": 5410 + }, + { + "epoch": 1.4279216927031837, + "grad_norm": 0.03074715845286846, + "learning_rate": 5.723710252012139e-05, + "loss": 0.003, + "step": 5415 + }, + { + "epoch": 1.4292399973633907, + "grad_norm": 1.7238941192626953, + "learning_rate": 5.710515899195144e-05, + "loss": 0.0254, + "step": 5420 + }, + { + "epoch": 1.4305583020235977, + "grad_norm": 0.012462320737540722, + "learning_rate": 5.6973215463781506e-05, + "loss": 0.0018, + "step": 5425 + }, + { + "epoch": 1.4318766066838047, + "grad_norm": 0.021576853469014168, + "learning_rate": 5.684127193561156e-05, + "loss": 
0.0472, + "step": 5430 + }, + { + "epoch": 1.4331949113440117, + "grad_norm": 0.2862134575843811, + "learning_rate": 5.670932840744162e-05, + "loss": 0.0258, + "step": 5435 + }, + { + "epoch": 1.4345132160042184, + "grad_norm": 0.28419312834739685, + "learning_rate": 5.657738487927168e-05, + "loss": 0.0053, + "step": 5440 + }, + { + "epoch": 1.4358315206644257, + "grad_norm": 0.013650139793753624, + "learning_rate": 5.6445441351101735e-05, + "loss": 0.0126, + "step": 5445 + }, + { + "epoch": 1.4371498253246324, + "grad_norm": 0.01203097216784954, + "learning_rate": 5.631349782293179e-05, + "loss": 0.0076, + "step": 5450 + }, + { + "epoch": 1.4384681299848394, + "grad_norm": 0.0881054624915123, + "learning_rate": 5.618155429476184e-05, + "loss": 0.0178, + "step": 5455 + }, + { + "epoch": 1.4397864346450464, + "grad_norm": 0.5258516669273376, + "learning_rate": 5.6049610766591895e-05, + "loss": 0.0112, + "step": 5460 + }, + { + "epoch": 1.4411047393052534, + "grad_norm": 0.001202153041958809, + "learning_rate": 5.591766723842196e-05, + "loss": 0.0089, + "step": 5465 + }, + { + "epoch": 1.4424230439654604, + "grad_norm": 0.4498993456363678, + "learning_rate": 5.5785723710252014e-05, + "loss": 0.0252, + "step": 5470 + }, + { + "epoch": 1.4437413486256674, + "grad_norm": 0.17477644979953766, + "learning_rate": 5.565378018208207e-05, + "loss": 0.0169, + "step": 5475 + }, + { + "epoch": 1.4450596532858744, + "grad_norm": 0.019443338736891747, + "learning_rate": 5.552183665391213e-05, + "loss": 0.0019, + "step": 5480 + }, + { + "epoch": 1.4463779579460814, + "grad_norm": 0.005653039086610079, + "learning_rate": 5.538989312574219e-05, + "loss": 0.0231, + "step": 5485 + }, + { + "epoch": 1.4476962626062884, + "grad_norm": 0.01554112322628498, + "learning_rate": 5.525794959757224e-05, + "loss": 0.0167, + "step": 5490 + }, + { + "epoch": 1.4490145672664954, + "grad_norm": 0.044272180646657944, + "learning_rate": 5.5126006069402305e-05, + "loss": 0.007, + "step": 5495 + }, + { + "epoch": 1.4503328719267023, + "grad_norm": 0.014857172966003418, + "learning_rate": 5.499406254123236e-05, + "loss": 0.0045, + "step": 5500 + }, + { + "epoch": 1.4503328719267023, + "eval_loss": 0.02392147295176983, + "eval_runtime": 452.468, + "eval_samples_per_second": 7.452, + "eval_steps_per_second": 3.726, + "step": 5500 + }, + { + "epoch": 1.4516511765869091, + "grad_norm": 0.007390835788100958, + "learning_rate": 5.486211901306241e-05, + "loss": 0.0171, + "step": 5505 + }, + { + "epoch": 1.4529694812471163, + "grad_norm": 0.0050474610179662704, + "learning_rate": 5.4730175484892466e-05, + "loss": 0.004, + "step": 5510 + }, + { + "epoch": 1.454287785907323, + "grad_norm": 0.08066163957118988, + "learning_rate": 5.459823195672252e-05, + "loss": 0.0103, + "step": 5515 + }, + { + "epoch": 1.45560609056753, + "grad_norm": 0.0062376330606639385, + "learning_rate": 5.4466288428552584e-05, + "loss": 0.0066, + "step": 5520 + }, + { + "epoch": 1.456924395227737, + "grad_norm": 0.00711809890344739, + "learning_rate": 5.433434490038264e-05, + "loss": 0.003, + "step": 5525 + }, + { + "epoch": 1.458242699887944, + "grad_norm": 0.004010149277746677, + "learning_rate": 5.4202401372212695e-05, + "loss": 0.0231, + "step": 5530 + }, + { + "epoch": 1.459561004548151, + "grad_norm": 0.4791967272758484, + "learning_rate": 5.407045784404276e-05, + "loss": 0.0277, + "step": 5535 + }, + { + "epoch": 1.460879309208358, + "grad_norm": 0.03979189693927765, + "learning_rate": 5.393851431587281e-05, + "loss": 0.0033, + "step": 5540 + }, + { + 
"epoch": 1.462197613868565, + "grad_norm": 0.03331119939684868, + "learning_rate": 5.380657078770287e-05, + "loss": 0.0187, + "step": 5545 + }, + { + "epoch": 1.463515918528772, + "grad_norm": 0.0042802803218364716, + "learning_rate": 5.367462725953293e-05, + "loss": 0.0032, + "step": 5550 + }, + { + "epoch": 1.464834223188979, + "grad_norm": 0.05439918115735054, + "learning_rate": 5.354268373136297e-05, + "loss": 0.0043, + "step": 5555 + }, + { + "epoch": 1.4661525278491858, + "grad_norm": 0.042643506079912186, + "learning_rate": 5.3410740203193036e-05, + "loss": 0.0059, + "step": 5560 + }, + { + "epoch": 1.467470832509393, + "grad_norm": 0.023453116416931152, + "learning_rate": 5.327879667502309e-05, + "loss": 0.0043, + "step": 5565 + }, + { + "epoch": 1.4687891371695998, + "grad_norm": 0.037712760269641876, + "learning_rate": 5.314685314685315e-05, + "loss": 0.0033, + "step": 5570 + }, + { + "epoch": 1.4701074418298068, + "grad_norm": 1.0485608577728271, + "learning_rate": 5.301490961868321e-05, + "loss": 0.0489, + "step": 5575 + }, + { + "epoch": 1.4714257464900138, + "grad_norm": 0.004728829488158226, + "learning_rate": 5.2882966090513265e-05, + "loss": 0.0067, + "step": 5580 + }, + { + "epoch": 1.4727440511502208, + "grad_norm": 0.027893677353858948, + "learning_rate": 5.275102256234332e-05, + "loss": 0.0208, + "step": 5585 + }, + { + "epoch": 1.4740623558104278, + "grad_norm": 0.02256879396736622, + "learning_rate": 5.2619079034173377e-05, + "loss": 0.0036, + "step": 5590 + }, + { + "epoch": 1.4753806604706348, + "grad_norm": 0.12636558711528778, + "learning_rate": 5.248713550600344e-05, + "loss": 0.0046, + "step": 5595 + }, + { + "epoch": 1.4766989651308418, + "grad_norm": 0.000997041119262576, + "learning_rate": 5.235519197783348e-05, + "loss": 0.0101, + "step": 5600 + }, + { + "epoch": 1.4780172697910487, + "grad_norm": 0.023494020104408264, + "learning_rate": 5.2223248449663543e-05, + "loss": 0.0039, + "step": 5605 + }, + { + "epoch": 1.4793355744512557, + "grad_norm": 0.01525307446718216, + "learning_rate": 5.20913049214936e-05, + "loss": 0.021, + "step": 5610 + }, + { + "epoch": 1.4806538791114627, + "grad_norm": 0.0024215306621044874, + "learning_rate": 5.1959361393323655e-05, + "loss": 0.0017, + "step": 5615 + }, + { + "epoch": 1.4819721837716697, + "grad_norm": 1.4708061218261719, + "learning_rate": 5.182741786515372e-05, + "loss": 0.04, + "step": 5620 + }, + { + "epoch": 1.4832904884318765, + "grad_norm": 0.015033531002700329, + "learning_rate": 5.169547433698377e-05, + "loss": 0.0042, + "step": 5625 + }, + { + "epoch": 1.4846087930920837, + "grad_norm": 0.0035444959066808224, + "learning_rate": 5.156353080881383e-05, + "loss": 0.0087, + "step": 5630 + }, + { + "epoch": 1.4859270977522905, + "grad_norm": 0.010087919421494007, + "learning_rate": 5.143158728064389e-05, + "loss": 0.0158, + "step": 5635 + }, + { + "epoch": 1.4872454024124975, + "grad_norm": 0.05779251083731651, + "learning_rate": 5.129964375247395e-05, + "loss": 0.0157, + "step": 5640 + }, + { + "epoch": 1.4885637070727045, + "grad_norm": 0.14927980303764343, + "learning_rate": 5.1167700224304e-05, + "loss": 0.0257, + "step": 5645 + }, + { + "epoch": 1.4898820117329115, + "grad_norm": 0.004252352751791477, + "learning_rate": 5.103575669613405e-05, + "loss": 0.0198, + "step": 5650 + }, + { + "epoch": 1.4912003163931185, + "grad_norm": 0.0029206848703324795, + "learning_rate": 5.090381316796411e-05, + "loss": 0.0016, + "step": 5655 + }, + { + "epoch": 1.4925186210533254, + "grad_norm": 0.005047530401498079, + 
"learning_rate": 5.077186963979417e-05, + "loss": 0.0023, + "step": 5660 + }, + { + "epoch": 1.4938369257135324, + "grad_norm": 0.003732564626261592, + "learning_rate": 5.0639926111624225e-05, + "loss": 0.0336, + "step": 5665 + }, + { + "epoch": 1.4951552303737394, + "grad_norm": 0.3832889497280121, + "learning_rate": 5.050798258345428e-05, + "loss": 0.0476, + "step": 5670 + }, + { + "epoch": 1.4964735350339464, + "grad_norm": 0.06733009219169617, + "learning_rate": 5.037603905528434e-05, + "loss": 0.0044, + "step": 5675 + }, + { + "epoch": 1.4977918396941532, + "grad_norm": 0.008067069575190544, + "learning_rate": 5.02440955271144e-05, + "loss": 0.0035, + "step": 5680 + }, + { + "epoch": 1.4991101443543604, + "grad_norm": 0.01706300489604473, + "learning_rate": 5.0112151998944454e-05, + "loss": 0.0031, + "step": 5685 + }, + { + "epoch": 1.5004284490145672, + "grad_norm": 0.009932024404406548, + "learning_rate": 4.998020847077451e-05, + "loss": 0.0587, + "step": 5690 + }, + { + "epoch": 1.5017467536747744, + "grad_norm": 0.006488936021924019, + "learning_rate": 4.9848264942604566e-05, + "loss": 0.002, + "step": 5695 + }, + { + "epoch": 1.5030650583349812, + "grad_norm": 0.17488756775856018, + "learning_rate": 4.971632141443462e-05, + "loss": 0.0245, + "step": 5700 + }, + { + "epoch": 1.5043833629951882, + "grad_norm": 0.3327178359031677, + "learning_rate": 4.9584377886264684e-05, + "loss": 0.0404, + "step": 5705 + }, + { + "epoch": 1.5057016676553951, + "grad_norm": 0.18467263877391815, + "learning_rate": 4.945243435809474e-05, + "loss": 0.0248, + "step": 5710 + }, + { + "epoch": 1.5070199723156021, + "grad_norm": 0.020061776041984558, + "learning_rate": 4.9320490829924795e-05, + "loss": 0.0034, + "step": 5715 + }, + { + "epoch": 1.5083382769758091, + "grad_norm": 0.0005288647953420877, + "learning_rate": 4.918854730175485e-05, + "loss": 0.0076, + "step": 5720 + }, + { + "epoch": 1.5096565816360161, + "grad_norm": 0.007515576668083668, + "learning_rate": 4.9056603773584906e-05, + "loss": 0.004, + "step": 5725 + }, + { + "epoch": 1.5109748862962231, + "grad_norm": 0.05365758761763573, + "learning_rate": 4.892466024541497e-05, + "loss": 0.0222, + "step": 5730 + }, + { + "epoch": 1.51229319095643, + "grad_norm": 0.00572391040623188, + "learning_rate": 4.8792716717245025e-05, + "loss": 0.0132, + "step": 5735 + }, + { + "epoch": 1.513611495616637, + "grad_norm": 0.21178627014160156, + "learning_rate": 4.8660773189075073e-05, + "loss": 0.0417, + "step": 5740 + }, + { + "epoch": 1.5149298002768439, + "grad_norm": 0.0641486868262291, + "learning_rate": 4.8528829660905136e-05, + "loss": 0.011, + "step": 5745 + }, + { + "epoch": 1.516248104937051, + "grad_norm": 0.04451924189925194, + "learning_rate": 4.839688613273519e-05, + "loss": 0.012, + "step": 5750 + }, + { + "epoch": 1.5175664095972579, + "grad_norm": 0.019951259717345238, + "learning_rate": 4.826494260456525e-05, + "loss": 0.009, + "step": 5755 + }, + { + "epoch": 1.5188847142574649, + "grad_norm": 0.021919893100857735, + "learning_rate": 4.813299907639531e-05, + "loss": 0.0081, + "step": 5760 + }, + { + "epoch": 1.5202030189176718, + "grad_norm": 0.5730367302894592, + "learning_rate": 4.800105554822536e-05, + "loss": 0.0254, + "step": 5765 + }, + { + "epoch": 1.5215213235778788, + "grad_norm": 0.02501523122191429, + "learning_rate": 4.786911202005542e-05, + "loss": 0.0045, + "step": 5770 + }, + { + "epoch": 1.5228396282380858, + "grad_norm": 0.01574208028614521, + "learning_rate": 4.773716849188548e-05, + "loss": 0.0081, + "step": 5775 + 
}, + { + "epoch": 1.5241579328982928, + "grad_norm": 0.009626791812479496, + "learning_rate": 4.760522496371553e-05, + "loss": 0.0037, + "step": 5780 + }, + { + "epoch": 1.5254762375584998, + "grad_norm": 0.535539448261261, + "learning_rate": 4.747328143554559e-05, + "loss": 0.0149, + "step": 5785 + }, + { + "epoch": 1.5267945422187066, + "grad_norm": 0.004934845492243767, + "learning_rate": 4.7341337907375644e-05, + "loss": 0.0048, + "step": 5790 + }, + { + "epoch": 1.5281128468789138, + "grad_norm": 0.009070080704987049, + "learning_rate": 4.72093943792057e-05, + "loss": 0.0028, + "step": 5795 + }, + { + "epoch": 1.5294311515391206, + "grad_norm": 0.0040720063261687756, + "learning_rate": 4.707745085103576e-05, + "loss": 0.0016, + "step": 5800 + }, + { + "epoch": 1.5307494561993278, + "grad_norm": 0.45212000608444214, + "learning_rate": 4.694550732286582e-05, + "loss": 0.0111, + "step": 5805 + }, + { + "epoch": 1.5320677608595346, + "grad_norm": 0.024048497900366783, + "learning_rate": 4.681356379469587e-05, + "loss": 0.0149, + "step": 5810 + }, + { + "epoch": 1.5333860655197418, + "grad_norm": 0.11899136006832123, + "learning_rate": 4.668162026652593e-05, + "loss": 0.0034, + "step": 5815 + }, + { + "epoch": 1.5347043701799485, + "grad_norm": 0.011249657720327377, + "learning_rate": 4.6549676738355984e-05, + "loss": 0.0052, + "step": 5820 + }, + { + "epoch": 1.5360226748401555, + "grad_norm": 0.051634710282087326, + "learning_rate": 4.641773321018604e-05, + "loss": 0.0031, + "step": 5825 + }, + { + "epoch": 1.5373409795003625, + "grad_norm": 0.3726826012134552, + "learning_rate": 4.62857896820161e-05, + "loss": 0.0582, + "step": 5830 + }, + { + "epoch": 1.5386592841605695, + "grad_norm": 0.5827310681343079, + "learning_rate": 4.615384615384616e-05, + "loss": 0.0652, + "step": 5835 + }, + { + "epoch": 1.5399775888207765, + "grad_norm": 0.006390869617462158, + "learning_rate": 4.6021902625676214e-05, + "loss": 0.0022, + "step": 5840 + }, + { + "epoch": 1.5412958934809835, + "grad_norm": 0.022760871797800064, + "learning_rate": 4.588995909750627e-05, + "loss": 0.0311, + "step": 5845 + }, + { + "epoch": 1.5426141981411905, + "grad_norm": 0.22773241996765137, + "learning_rate": 4.5758015569336325e-05, + "loss": 0.0051, + "step": 5850 + }, + { + "epoch": 1.5439325028013973, + "grad_norm": 0.015375247225165367, + "learning_rate": 4.562607204116639e-05, + "loss": 0.0023, + "step": 5855 + }, + { + "epoch": 1.5452508074616045, + "grad_norm": 0.007347101345658302, + "learning_rate": 4.549412851299644e-05, + "loss": 0.0437, + "step": 5860 + }, + { + "epoch": 1.5465691121218113, + "grad_norm": 0.012344900518655777, + "learning_rate": 4.536218498482649e-05, + "loss": 0.004, + "step": 5865 + }, + { + "epoch": 1.5478874167820185, + "grad_norm": 0.27038896083831787, + "learning_rate": 4.5230241456656555e-05, + "loss": 0.0047, + "step": 5870 + }, + { + "epoch": 1.5492057214422252, + "grad_norm": 0.016395213082432747, + "learning_rate": 4.509829792848661e-05, + "loss": 0.0026, + "step": 5875 + }, + { + "epoch": 1.5505240261024322, + "grad_norm": 0.4217267632484436, + "learning_rate": 4.4966354400316666e-05, + "loss": 0.0364, + "step": 5880 + }, + { + "epoch": 1.5518423307626392, + "grad_norm": 0.20046105980873108, + "learning_rate": 4.483441087214673e-05, + "loss": 0.0243, + "step": 5885 + }, + { + "epoch": 1.5531606354228462, + "grad_norm": 0.004307698458433151, + "learning_rate": 4.470246734397678e-05, + "loss": 0.0064, + "step": 5890 + }, + { + "epoch": 1.5544789400830532, + "grad_norm": 
0.46102187037467957, + "learning_rate": 4.457052381580683e-05, + "loss": 0.0115, + "step": 5895 + }, + { + "epoch": 1.5557972447432602, + "grad_norm": 0.0689118504524231, + "learning_rate": 4.4438580287636895e-05, + "loss": 0.0334, + "step": 5900 + }, + { + "epoch": 1.5571155494034672, + "grad_norm": 0.003091114340350032, + "learning_rate": 4.430663675946695e-05, + "loss": 0.0246, + "step": 5905 + }, + { + "epoch": 1.558433854063674, + "grad_norm": 0.003877349430695176, + "learning_rate": 4.417469323129701e-05, + "loss": 0.0032, + "step": 5910 + }, + { + "epoch": 1.5597521587238812, + "grad_norm": 0.30713143944740295, + "learning_rate": 4.404274970312706e-05, + "loss": 0.0229, + "step": 5915 + }, + { + "epoch": 1.561070463384088, + "grad_norm": 0.07344445586204529, + "learning_rate": 4.391080617495712e-05, + "loss": 0.0078, + "step": 5920 + }, + { + "epoch": 1.5623887680442952, + "grad_norm": 0.01774723082780838, + "learning_rate": 4.377886264678718e-05, + "loss": 0.0034, + "step": 5925 + }, + { + "epoch": 1.563707072704502, + "grad_norm": 0.476324200630188, + "learning_rate": 4.3646919118617236e-05, + "loss": 0.0071, + "step": 5930 + }, + { + "epoch": 1.5650253773647091, + "grad_norm": 0.11624465882778168, + "learning_rate": 4.351497559044729e-05, + "loss": 0.0236, + "step": 5935 + }, + { + "epoch": 1.566343682024916, + "grad_norm": 0.190691277384758, + "learning_rate": 4.338303206227735e-05, + "loss": 0.006, + "step": 5940 + }, + { + "epoch": 1.567661986685123, + "grad_norm": 0.20517045259475708, + "learning_rate": 4.32510885341074e-05, + "loss": 0.009, + "step": 5945 + }, + { + "epoch": 1.56898029134533, + "grad_norm": 0.008122317492961884, + "learning_rate": 4.311914500593746e-05, + "loss": 0.0041, + "step": 5950 + }, + { + "epoch": 1.570298596005537, + "grad_norm": 0.01982291042804718, + "learning_rate": 4.298720147776752e-05, + "loss": 0.0258, + "step": 5955 + }, + { + "epoch": 1.5716169006657439, + "grad_norm": 0.000996922142803669, + "learning_rate": 4.285525794959758e-05, + "loss": 0.0233, + "step": 5960 + }, + { + "epoch": 1.5729352053259509, + "grad_norm": 0.09725592285394669, + "learning_rate": 4.272331442142763e-05, + "loss": 0.0218, + "step": 5965 + }, + { + "epoch": 1.5742535099861579, + "grad_norm": 0.0672350749373436, + "learning_rate": 4.259137089325769e-05, + "loss": 0.0194, + "step": 5970 + }, + { + "epoch": 1.5755718146463646, + "grad_norm": 0.014844833873212337, + "learning_rate": 4.2459427365087744e-05, + "loss": 0.0298, + "step": 5975 + }, + { + "epoch": 1.5768901193065719, + "grad_norm": 0.030519040301442146, + "learning_rate": 4.2327483836917806e-05, + "loss": 0.0178, + "step": 5980 + }, + { + "epoch": 1.5782084239667786, + "grad_norm": 0.018561460077762604, + "learning_rate": 4.219554030874786e-05, + "loss": 0.0154, + "step": 5985 + }, + { + "epoch": 1.5795267286269858, + "grad_norm": 0.02470085583627224, + "learning_rate": 4.206359678057791e-05, + "loss": 0.0361, + "step": 5990 + }, + { + "epoch": 1.5808450332871926, + "grad_norm": 0.055412422865629196, + "learning_rate": 4.193165325240797e-05, + "loss": 0.0162, + "step": 5995 + }, + { + "epoch": 1.5821633379473996, + "grad_norm": 0.0034158769994974136, + "learning_rate": 4.179970972423803e-05, + "loss": 0.0068, + "step": 6000 + }, + { + "epoch": 1.5821633379473996, + "eval_loss": 0.024797894060611725, + "eval_runtime": 452.1611, + "eval_samples_per_second": 7.458, + "eval_steps_per_second": 3.729, + "step": 6000 + }, + { + "epoch": 1.5834816426076066, + "grad_norm": 0.01284120511263609, + "learning_rate": 
4.1667766196068085e-05, + "loss": 0.0036, + "step": 6005 + }, + { + "epoch": 1.5847999472678136, + "grad_norm": 0.01274865586310625, + "learning_rate": 4.153582266789815e-05, + "loss": 0.0447, + "step": 6010 + }, + { + "epoch": 1.5861182519280206, + "grad_norm": 0.03555435314774513, + "learning_rate": 4.1403879139728196e-05, + "loss": 0.0078, + "step": 6015 + }, + { + "epoch": 1.5874365565882276, + "grad_norm": 0.0011938117677345872, + "learning_rate": 4.127193561155825e-05, + "loss": 0.0136, + "step": 6020 + }, + { + "epoch": 1.5887548612484346, + "grad_norm": 0.9741255640983582, + "learning_rate": 4.1139992083388314e-05, + "loss": 0.0153, + "step": 6025 + }, + { + "epoch": 1.5900731659086413, + "grad_norm": 0.011220674030482769, + "learning_rate": 4.100804855521837e-05, + "loss": 0.0262, + "step": 6030 + }, + { + "epoch": 1.5913914705688486, + "grad_norm": 0.021556466817855835, + "learning_rate": 4.0876105027048425e-05, + "loss": 0.0044, + "step": 6035 + }, + { + "epoch": 1.5927097752290553, + "grad_norm": 0.2725502848625183, + "learning_rate": 4.074416149887848e-05, + "loss": 0.0558, + "step": 6040 + }, + { + "epoch": 1.5940280798892625, + "grad_norm": 0.6407182216644287, + "learning_rate": 4.0612217970708537e-05, + "loss": 0.0261, + "step": 6045 + }, + { + "epoch": 1.5953463845494693, + "grad_norm": 0.0024960115551948547, + "learning_rate": 4.04802744425386e-05, + "loss": 0.0128, + "step": 6050 + }, + { + "epoch": 1.5966646892096763, + "grad_norm": 0.11380109190940857, + "learning_rate": 4.0348330914368655e-05, + "loss": 0.0199, + "step": 6055 + }, + { + "epoch": 1.5979829938698833, + "grad_norm": 0.18358005583286285, + "learning_rate": 4.0216387386198704e-05, + "loss": 0.0083, + "step": 6060 + }, + { + "epoch": 1.5993012985300903, + "grad_norm": 0.06412303447723389, + "learning_rate": 4.0084443858028766e-05, + "loss": 0.0548, + "step": 6065 + }, + { + "epoch": 1.6006196031902973, + "grad_norm": 0.6999421119689941, + "learning_rate": 3.995250032985882e-05, + "loss": 0.0074, + "step": 6070 + }, + { + "epoch": 1.6019379078505043, + "grad_norm": 0.18698133528232574, + "learning_rate": 3.982055680168888e-05, + "loss": 0.0542, + "step": 6075 + }, + { + "epoch": 1.6032562125107113, + "grad_norm": 0.014717207290232182, + "learning_rate": 3.968861327351894e-05, + "loss": 0.0071, + "step": 6080 + }, + { + "epoch": 1.604574517170918, + "grad_norm": 0.0765385851264, + "learning_rate": 3.955666974534899e-05, + "loss": 0.0063, + "step": 6085 + }, + { + "epoch": 1.6058928218311253, + "grad_norm": 0.4332450330257416, + "learning_rate": 3.9424726217179044e-05, + "loss": 0.0071, + "step": 6090 + }, + { + "epoch": 1.607211126491332, + "grad_norm": 0.003700035158544779, + "learning_rate": 3.929278268900911e-05, + "loss": 0.0052, + "step": 6095 + }, + { + "epoch": 1.6085294311515392, + "grad_norm": 0.02500278130173683, + "learning_rate": 3.916083916083916e-05, + "loss": 0.0387, + "step": 6100 + }, + { + "epoch": 1.609847735811746, + "grad_norm": 0.023568281903862953, + "learning_rate": 3.902889563266922e-05, + "loss": 0.0594, + "step": 6105 + }, + { + "epoch": 1.6111660404719532, + "grad_norm": 0.02687825821340084, + "learning_rate": 3.8896952104499274e-05, + "loss": 0.0229, + "step": 6110 + }, + { + "epoch": 1.61248434513216, + "grad_norm": 0.005178579594939947, + "learning_rate": 3.876500857632933e-05, + "loss": 0.0293, + "step": 6115 + }, + { + "epoch": 1.613802649792367, + "grad_norm": 0.3987988531589508, + "learning_rate": 3.863306504815939e-05, + "loss": 0.015, + "step": 6120 + }, + { + "epoch": 
1.615120954452574, + "grad_norm": 0.18915466964244843, + "learning_rate": 3.850112151998945e-05, + "loss": 0.023, + "step": 6125 + }, + { + "epoch": 1.616439259112781, + "grad_norm": 0.015252528712153435, + "learning_rate": 3.83691779918195e-05, + "loss": 0.0185, + "step": 6130 + }, + { + "epoch": 1.617757563772988, + "grad_norm": 0.04947187379002571, + "learning_rate": 3.823723446364956e-05, + "loss": 0.0131, + "step": 6135 + }, + { + "epoch": 1.619075868433195, + "grad_norm": 0.017095958814024925, + "learning_rate": 3.8105290935479615e-05, + "loss": 0.0071, + "step": 6140 + }, + { + "epoch": 1.620394173093402, + "grad_norm": 0.013050337322056293, + "learning_rate": 3.797334740730967e-05, + "loss": 0.0038, + "step": 6145 + }, + { + "epoch": 1.6217124777536087, + "grad_norm": 0.08132806420326233, + "learning_rate": 3.784140387913973e-05, + "loss": 0.0043, + "step": 6150 + }, + { + "epoch": 1.623030782413816, + "grad_norm": 0.020741304382681847, + "learning_rate": 3.770946035096979e-05, + "loss": 0.006, + "step": 6155 + }, + { + "epoch": 1.6243490870740227, + "grad_norm": 0.0576217919588089, + "learning_rate": 3.7577516822799844e-05, + "loss": 0.0033, + "step": 6160 + }, + { + "epoch": 1.62566739173423, + "grad_norm": 0.03032900020480156, + "learning_rate": 3.74455732946299e-05, + "loss": 0.0318, + "step": 6165 + }, + { + "epoch": 1.6269856963944367, + "grad_norm": 0.8868799209594727, + "learning_rate": 3.7313629766459955e-05, + "loss": 0.0304, + "step": 6170 + }, + { + "epoch": 1.6283040010546437, + "grad_norm": 0.003816834883764386, + "learning_rate": 3.718168623829002e-05, + "loss": 0.003, + "step": 6175 + }, + { + "epoch": 1.6296223057148507, + "grad_norm": 0.05368296429514885, + "learning_rate": 3.704974271012007e-05, + "loss": 0.0064, + "step": 6180 + }, + { + "epoch": 1.6309406103750577, + "grad_norm": 0.09963366389274597, + "learning_rate": 3.691779918195012e-05, + "loss": 0.0097, + "step": 6185 + }, + { + "epoch": 1.6322589150352647, + "grad_norm": 0.006273225415498018, + "learning_rate": 3.6785855653780185e-05, + "loss": 0.0071, + "step": 6190 + }, + { + "epoch": 1.6335772196954716, + "grad_norm": 0.15079188346862793, + "learning_rate": 3.665391212561024e-05, + "loss": 0.0058, + "step": 6195 + }, + { + "epoch": 1.6348955243556786, + "grad_norm": 0.004980973433703184, + "learning_rate": 3.6521968597440296e-05, + "loss": 0.0051, + "step": 6200 + }, + { + "epoch": 1.6362138290158854, + "grad_norm": 0.004235363099724054, + "learning_rate": 3.639002506927036e-05, + "loss": 0.0028, + "step": 6205 + }, + { + "epoch": 1.6375321336760926, + "grad_norm": 0.003829963505268097, + "learning_rate": 3.625808154110041e-05, + "loss": 0.0347, + "step": 6210 + }, + { + "epoch": 1.6388504383362994, + "grad_norm": 0.021650686860084534, + "learning_rate": 3.612613801293046e-05, + "loss": 0.0036, + "step": 6215 + }, + { + "epoch": 1.6401687429965066, + "grad_norm": 0.06326934695243835, + "learning_rate": 3.5994194484760525e-05, + "loss": 0.0228, + "step": 6220 + }, + { + "epoch": 1.6414870476567134, + "grad_norm": 0.017276322469115257, + "learning_rate": 3.586225095659058e-05, + "loss": 0.0025, + "step": 6225 + }, + { + "epoch": 1.6428053523169206, + "grad_norm": 0.005066063720732927, + "learning_rate": 3.573030742842064e-05, + "loss": 0.0047, + "step": 6230 + }, + { + "epoch": 1.6441236569771274, + "grad_norm": 0.003512267954647541, + "learning_rate": 3.559836390025069e-05, + "loss": 0.0018, + "step": 6235 + }, + { + "epoch": 1.6454419616373344, + "grad_norm": 0.004347699694335461, + "learning_rate": 
3.546642037208075e-05, + "loss": 0.0045, + "step": 6240 + }, + { + "epoch": 1.6467602662975414, + "grad_norm": 0.008277533575892448, + "learning_rate": 3.533447684391081e-05, + "loss": 0.0456, + "step": 6245 + }, + { + "epoch": 1.6480785709577483, + "grad_norm": 0.00973033718764782, + "learning_rate": 3.5202533315740866e-05, + "loss": 0.0215, + "step": 6250 + }, + { + "epoch": 1.6493968756179553, + "grad_norm": 1.9432978630065918, + "learning_rate": 3.507058978757092e-05, + "loss": 0.0132, + "step": 6255 + }, + { + "epoch": 1.6507151802781623, + "grad_norm": 0.2693535387516022, + "learning_rate": 3.493864625940098e-05, + "loss": 0.0037, + "step": 6260 + }, + { + "epoch": 1.6520334849383693, + "grad_norm": 0.02107766456902027, + "learning_rate": 3.480670273123103e-05, + "loss": 0.0031, + "step": 6265 + }, + { + "epoch": 1.653351789598576, + "grad_norm": 0.07168436795473099, + "learning_rate": 3.467475920306109e-05, + "loss": 0.0101, + "step": 6270 + }, + { + "epoch": 1.6546700942587833, + "grad_norm": 0.06479799002408981, + "learning_rate": 3.454281567489115e-05, + "loss": 0.0032, + "step": 6275 + }, + { + "epoch": 1.65598839891899, + "grad_norm": 0.0013557536294683814, + "learning_rate": 3.441087214672121e-05, + "loss": 0.0037, + "step": 6280 + }, + { + "epoch": 1.6573067035791973, + "grad_norm": 0.07330150157213211, + "learning_rate": 3.427892861855126e-05, + "loss": 0.0031, + "step": 6285 + }, + { + "epoch": 1.658625008239404, + "grad_norm": 0.08246012777090073, + "learning_rate": 3.414698509038132e-05, + "loss": 0.0028, + "step": 6290 + }, + { + "epoch": 1.659943312899611, + "grad_norm": 0.6232367157936096, + "learning_rate": 3.4015041562211374e-05, + "loss": 0.0042, + "step": 6295 + }, + { + "epoch": 1.661261617559818, + "grad_norm": 0.007676729932427406, + "learning_rate": 3.388309803404143e-05, + "loss": 0.0501, + "step": 6300 + }, + { + "epoch": 1.662579922220025, + "grad_norm": 0.02081216312944889, + "learning_rate": 3.375115450587149e-05, + "loss": 0.0047, + "step": 6305 + }, + { + "epoch": 1.663898226880232, + "grad_norm": 0.008829087018966675, + "learning_rate": 3.361921097770154e-05, + "loss": 0.0298, + "step": 6310 + }, + { + "epoch": 1.665216531540439, + "grad_norm": 0.4426127076148987, + "learning_rate": 3.34872674495316e-05, + "loss": 0.0045, + "step": 6315 + }, + { + "epoch": 1.666534836200646, + "grad_norm": 0.025818035006523132, + "learning_rate": 3.335532392136166e-05, + "loss": 0.0028, + "step": 6320 + }, + { + "epoch": 1.6678531408608528, + "grad_norm": 0.6068133115768433, + "learning_rate": 3.3223380393191715e-05, + "loss": 0.0202, + "step": 6325 + }, + { + "epoch": 1.66917144552106, + "grad_norm": 0.02740122564136982, + "learning_rate": 3.309143686502178e-05, + "loss": 0.0025, + "step": 6330 + }, + { + "epoch": 1.6704897501812668, + "grad_norm": 0.15878735482692719, + "learning_rate": 3.2959493336851826e-05, + "loss": 0.004, + "step": 6335 + }, + { + "epoch": 1.671808054841474, + "grad_norm": 0.006827466655522585, + "learning_rate": 3.282754980868188e-05, + "loss": 0.0048, + "step": 6340 + }, + { + "epoch": 1.6731263595016808, + "grad_norm": 0.19508551061153412, + "learning_rate": 3.2695606280511944e-05, + "loss": 0.0025, + "step": 6345 + }, + { + "epoch": 1.674444664161888, + "grad_norm": 0.8176754713058472, + "learning_rate": 3.2563662752342e-05, + "loss": 0.0151, + "step": 6350 + }, + { + "epoch": 1.6757629688220947, + "grad_norm": 0.011672024615108967, + "learning_rate": 3.2431719224172055e-05, + "loss": 0.0452, + "step": 6355 + }, + { + "epoch": 
1.6770812734823017, + "grad_norm": 0.015824951231479645, + "learning_rate": 3.229977569600211e-05, + "loss": 0.0236, + "step": 6360 + }, + { + "epoch": 1.6783995781425087, + "grad_norm": 0.1358737051486969, + "learning_rate": 3.216783216783217e-05, + "loss": 0.0078, + "step": 6365 + }, + { + "epoch": 1.6797178828027157, + "grad_norm": 0.004896901547908783, + "learning_rate": 3.203588863966223e-05, + "loss": 0.0042, + "step": 6370 + }, + { + "epoch": 1.6810361874629227, + "grad_norm": 0.22593103349208832, + "learning_rate": 3.1903945111492285e-05, + "loss": 0.0053, + "step": 6375 + }, + { + "epoch": 1.6823544921231297, + "grad_norm": 0.0073196059092879295, + "learning_rate": 3.177200158332234e-05, + "loss": 0.0287, + "step": 6380 + }, + { + "epoch": 1.6836727967833367, + "grad_norm": 0.018524926155805588, + "learning_rate": 3.1640058055152396e-05, + "loss": 0.0122, + "step": 6385 + }, + { + "epoch": 1.6849911014435435, + "grad_norm": 0.7453815937042236, + "learning_rate": 3.150811452698245e-05, + "loss": 0.0378, + "step": 6390 + }, + { + "epoch": 1.6863094061037507, + "grad_norm": 0.22409795224666595, + "learning_rate": 3.137617099881251e-05, + "loss": 0.0282, + "step": 6395 + }, + { + "epoch": 1.6876277107639575, + "grad_norm": 0.005432693753391504, + "learning_rate": 3.124422747064257e-05, + "loss": 0.0162, + "step": 6400 + }, + { + "epoch": 1.6889460154241647, + "grad_norm": 0.1493055820465088, + "learning_rate": 3.1112283942472626e-05, + "loss": 0.0123, + "step": 6405 + }, + { + "epoch": 1.6902643200843714, + "grad_norm": 0.1638440042734146, + "learning_rate": 3.0980340414302674e-05, + "loss": 0.0058, + "step": 6410 + }, + { + "epoch": 1.6915826247445784, + "grad_norm": 0.015779908746480942, + "learning_rate": 3.084839688613274e-05, + "loss": 0.0157, + "step": 6415 + }, + { + "epoch": 1.6929009294047854, + "grad_norm": 0.0012348912423476577, + "learning_rate": 3.071645335796279e-05, + "loss": 0.0016, + "step": 6420 + }, + { + "epoch": 1.6942192340649924, + "grad_norm": 0.05294624716043472, + "learning_rate": 3.058450982979285e-05, + "loss": 0.0037, + "step": 6425 + }, + { + "epoch": 1.6955375387251994, + "grad_norm": 0.01926981844007969, + "learning_rate": 3.045256630162291e-05, + "loss": 0.0053, + "step": 6430 + }, + { + "epoch": 1.6968558433854064, + "grad_norm": 0.005958891473710537, + "learning_rate": 3.0320622773452963e-05, + "loss": 0.0025, + "step": 6435 + }, + { + "epoch": 1.6981741480456134, + "grad_norm": 0.001902201445773244, + "learning_rate": 3.018867924528302e-05, + "loss": 0.0027, + "step": 6440 + }, + { + "epoch": 1.6994924527058202, + "grad_norm": 0.036614127457141876, + "learning_rate": 3.0056735717113078e-05, + "loss": 0.0026, + "step": 6445 + }, + { + "epoch": 1.7008107573660274, + "grad_norm": 0.07294526696205139, + "learning_rate": 2.9924792188943133e-05, + "loss": 0.0042, + "step": 6450 + }, + { + "epoch": 1.7021290620262342, + "grad_norm": 0.42822372913360596, + "learning_rate": 2.9792848660773192e-05, + "loss": 0.013, + "step": 6455 + }, + { + "epoch": 1.7034473666864414, + "grad_norm": 0.036622967571020126, + "learning_rate": 2.9660905132603245e-05, + "loss": 0.0029, + "step": 6460 + }, + { + "epoch": 1.7047656713466481, + "grad_norm": 0.08314034342765808, + "learning_rate": 2.9528961604433304e-05, + "loss": 0.0043, + "step": 6465 + }, + { + "epoch": 1.7060839760068551, + "grad_norm": 0.0005654952838085592, + "learning_rate": 2.939701807626336e-05, + "loss": 0.0595, + "step": 6470 + }, + { + "epoch": 1.7074022806670621, + "grad_norm": 0.004545385017991066, + 
"learning_rate": 2.926507454809342e-05, + "loss": 0.0044, + "step": 6475 + }, + { + "epoch": 1.7087205853272691, + "grad_norm": 0.00033831383916549385, + "learning_rate": 2.9133131019923477e-05, + "loss": 0.0046, + "step": 6480 + }, + { + "epoch": 1.710038889987476, + "grad_norm": 0.0019903562497347593, + "learning_rate": 2.900118749175353e-05, + "loss": 0.0026, + "step": 6485 + }, + { + "epoch": 1.711357194647683, + "grad_norm": 0.10188104957342148, + "learning_rate": 2.8869243963583585e-05, + "loss": 0.0069, + "step": 6490 + }, + { + "epoch": 1.71267549930789, + "grad_norm": 0.2123432606458664, + "learning_rate": 2.8737300435413644e-05, + "loss": 0.0199, + "step": 6495 + }, + { + "epoch": 1.7139938039680969, + "grad_norm": 0.43209517002105713, + "learning_rate": 2.8605356907243703e-05, + "loss": 0.0099, + "step": 6500 + }, + { + "epoch": 1.7139938039680969, + "eval_loss": 0.024327505379915237, + "eval_runtime": 452.0052, + "eval_samples_per_second": 7.46, + "eval_steps_per_second": 3.73, + "step": 6500 + }, + { + "epoch": 1.715312108628304, + "grad_norm": 0.009868285618722439, + "learning_rate": 2.847341337907376e-05, + "loss": 0.0025, + "step": 6505 + }, + { + "epoch": 1.7166304132885108, + "grad_norm": 0.00778606254607439, + "learning_rate": 2.834146985090381e-05, + "loss": 0.0028, + "step": 6510 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 0.02987460047006607, + "learning_rate": 2.820952632273387e-05, + "loss": 0.0068, + "step": 6515 + }, + { + "epoch": 1.7192670226089248, + "grad_norm": 0.04475142061710358, + "learning_rate": 2.807758279456393e-05, + "loss": 0.0022, + "step": 6520 + }, + { + "epoch": 1.720585327269132, + "grad_norm": 0.12720516324043274, + "learning_rate": 2.7945639266393985e-05, + "loss": 0.0488, + "step": 6525 + }, + { + "epoch": 1.7219036319293388, + "grad_norm": 0.0011463731061667204, + "learning_rate": 2.7813695738224044e-05, + "loss": 0.0023, + "step": 6530 + }, + { + "epoch": 1.7232219365895458, + "grad_norm": 0.008907752111554146, + "learning_rate": 2.7681752210054096e-05, + "loss": 0.0039, + "step": 6535 + }, + { + "epoch": 1.7245402412497528, + "grad_norm": 0.008416680619120598, + "learning_rate": 2.7549808681884156e-05, + "loss": 0.0055, + "step": 6540 + }, + { + "epoch": 1.7258585459099598, + "grad_norm": 0.26278871297836304, + "learning_rate": 2.741786515371421e-05, + "loss": 0.0386, + "step": 6545 + }, + { + "epoch": 1.7271768505701668, + "grad_norm": 0.01750275492668152, + "learning_rate": 2.728592162554427e-05, + "loss": 0.0048, + "step": 6550 + }, + { + "epoch": 1.7284951552303738, + "grad_norm": 0.009483959525823593, + "learning_rate": 2.7153978097374326e-05, + "loss": 0.0061, + "step": 6555 + }, + { + "epoch": 1.7298134598905808, + "grad_norm": 0.016591722145676613, + "learning_rate": 2.7022034569204378e-05, + "loss": 0.0058, + "step": 6560 + }, + { + "epoch": 1.7311317645507875, + "grad_norm": 0.5120682716369629, + "learning_rate": 2.6890091041034437e-05, + "loss": 0.0229, + "step": 6565 + }, + { + "epoch": 1.7324500692109948, + "grad_norm": 0.03748248517513275, + "learning_rate": 2.6758147512864496e-05, + "loss": 0.0026, + "step": 6570 + }, + { + "epoch": 1.7337683738712015, + "grad_norm": 0.08328749984502792, + "learning_rate": 2.6626203984694552e-05, + "loss": 0.0052, + "step": 6575 + }, + { + "epoch": 1.7350866785314087, + "grad_norm": 0.012284482829272747, + "learning_rate": 2.649426045652461e-05, + "loss": 0.0353, + "step": 6580 + }, + { + "epoch": 1.7364049831916155, + "grad_norm": 0.06362583488225937, + "learning_rate": 
2.6362316928354663e-05, + "loss": 0.0309, + "step": 6585 + }, + { + "epoch": 1.7377232878518225, + "grad_norm": 0.01475360058248043, + "learning_rate": 2.6230373400184722e-05, + "loss": 0.0034, + "step": 6590 + }, + { + "epoch": 1.7390415925120295, + "grad_norm": 0.002241638721898198, + "learning_rate": 2.6098429872014778e-05, + "loss": 0.0365, + "step": 6595 + }, + { + "epoch": 1.7403598971722365, + "grad_norm": 0.11375941336154938, + "learning_rate": 2.5966486343844837e-05, + "loss": 0.0241, + "step": 6600 + }, + { + "epoch": 1.7416782018324435, + "grad_norm": 0.009631779976189137, + "learning_rate": 2.5834542815674896e-05, + "loss": 0.0026, + "step": 6605 + }, + { + "epoch": 1.7429965064926505, + "grad_norm": 0.12113262712955475, + "learning_rate": 2.570259928750495e-05, + "loss": 0.0207, + "step": 6610 + }, + { + "epoch": 1.7443148111528575, + "grad_norm": 0.006536155007779598, + "learning_rate": 2.5570655759335004e-05, + "loss": 0.0022, + "step": 6615 + }, + { + "epoch": 1.7456331158130642, + "grad_norm": 0.043030887842178345, + "learning_rate": 2.5438712231165063e-05, + "loss": 0.003, + "step": 6620 + }, + { + "epoch": 1.7469514204732715, + "grad_norm": 0.00860620103776455, + "learning_rate": 2.5306768702995122e-05, + "loss": 0.027, + "step": 6625 + }, + { + "epoch": 1.7482697251334782, + "grad_norm": 0.014589210972189903, + "learning_rate": 2.5174825174825178e-05, + "loss": 0.0224, + "step": 6630 + }, + { + "epoch": 1.7495880297936854, + "grad_norm": 0.01215316355228424, + "learning_rate": 2.504288164665523e-05, + "loss": 0.011, + "step": 6635 + }, + { + "epoch": 1.7509063344538922, + "grad_norm": 0.10951556265354156, + "learning_rate": 2.491093811848529e-05, + "loss": 0.0384, + "step": 6640 + }, + { + "epoch": 1.7522246391140994, + "grad_norm": 0.30859875679016113, + "learning_rate": 2.4778994590315345e-05, + "loss": 0.0031, + "step": 6645 + }, + { + "epoch": 1.7535429437743062, + "grad_norm": 0.025427229702472687, + "learning_rate": 2.4647051062145404e-05, + "loss": 0.0171, + "step": 6650 + }, + { + "epoch": 1.7548612484345132, + "grad_norm": 0.03334197774529457, + "learning_rate": 2.451510753397546e-05, + "loss": 0.0473, + "step": 6655 + }, + { + "epoch": 1.7561795530947202, + "grad_norm": 0.013445639982819557, + "learning_rate": 2.438316400580552e-05, + "loss": 0.0056, + "step": 6660 + }, + { + "epoch": 1.7574978577549272, + "grad_norm": 0.008306960575282574, + "learning_rate": 2.425122047763557e-05, + "loss": 0.0104, + "step": 6665 + }, + { + "epoch": 1.7588161624151342, + "grad_norm": 0.012615012936294079, + "learning_rate": 2.411927694946563e-05, + "loss": 0.0097, + "step": 6670 + }, + { + "epoch": 1.7601344670753412, + "grad_norm": 0.006827410310506821, + "learning_rate": 2.398733342129569e-05, + "loss": 0.0057, + "step": 6675 + }, + { + "epoch": 1.7614527717355482, + "grad_norm": 0.017035294324159622, + "learning_rate": 2.3855389893125745e-05, + "loss": 0.0035, + "step": 6680 + }, + { + "epoch": 1.762771076395755, + "grad_norm": 0.036102693527936935, + "learning_rate": 2.37234463649558e-05, + "loss": 0.0031, + "step": 6685 + }, + { + "epoch": 1.7640893810559621, + "grad_norm": 0.5004498958587646, + "learning_rate": 2.3591502836785856e-05, + "loss": 0.0217, + "step": 6690 + }, + { + "epoch": 1.765407685716169, + "grad_norm": 0.017726672813296318, + "learning_rate": 2.3459559308615915e-05, + "loss": 0.0112, + "step": 6695 + }, + { + "epoch": 1.7667259903763761, + "grad_norm": 0.00940331444144249, + "learning_rate": 2.332761578044597e-05, + "loss": 0.0107, + "step": 6700 + }, 
+ { + "epoch": 1.768044295036583, + "grad_norm": 0.007495497819036245, + "learning_rate": 2.3195672252276026e-05, + "loss": 0.0032, + "step": 6705 + }, + { + "epoch": 1.7693625996967899, + "grad_norm": 0.6863199472427368, + "learning_rate": 2.3063728724106085e-05, + "loss": 0.034, + "step": 6710 + }, + { + "epoch": 1.7706809043569969, + "grad_norm": 0.004587489180266857, + "learning_rate": 2.293178519593614e-05, + "loss": 0.0032, + "step": 6715 + }, + { + "epoch": 1.7719992090172039, + "grad_norm": 0.017706016078591347, + "learning_rate": 2.2799841667766197e-05, + "loss": 0.0036, + "step": 6720 + }, + { + "epoch": 1.7733175136774109, + "grad_norm": 0.012740216217935085, + "learning_rate": 2.2667898139596252e-05, + "loss": 0.0147, + "step": 6725 + }, + { + "epoch": 1.7746358183376179, + "grad_norm": 0.010391579940915108, + "learning_rate": 2.253595461142631e-05, + "loss": 0.0041, + "step": 6730 + }, + { + "epoch": 1.7759541229978248, + "grad_norm": 0.021570540964603424, + "learning_rate": 2.2404011083256367e-05, + "loss": 0.0363, + "step": 6735 + }, + { + "epoch": 1.7772724276580316, + "grad_norm": 0.005778402555733919, + "learning_rate": 2.2272067555086423e-05, + "loss": 0.002, + "step": 6740 + }, + { + "epoch": 1.7785907323182388, + "grad_norm": 0.0, + "learning_rate": 2.2140124026916482e-05, + "loss": 0.0058, + "step": 6745 + }, + { + "epoch": 1.7799090369784456, + "grad_norm": 0.010869967751204967, + "learning_rate": 2.2008180498746537e-05, + "loss": 0.0036, + "step": 6750 + }, + { + "epoch": 1.7812273416386528, + "grad_norm": 0.04336518794298172, + "learning_rate": 2.1876236970576593e-05, + "loss": 0.0074, + "step": 6755 + }, + { + "epoch": 1.7825456462988596, + "grad_norm": 0.008664094842970371, + "learning_rate": 2.1744293442406652e-05, + "loss": 0.0027, + "step": 6760 + }, + { + "epoch": 1.7838639509590668, + "grad_norm": 0.9408183097839355, + "learning_rate": 2.1612349914236708e-05, + "loss": 0.0371, + "step": 6765 + }, + { + "epoch": 1.7851822556192736, + "grad_norm": 0.016822539269924164, + "learning_rate": 2.1480406386066763e-05, + "loss": 0.0137, + "step": 6770 + }, + { + "epoch": 1.7865005602794806, + "grad_norm": 0.00829544197767973, + "learning_rate": 2.134846285789682e-05, + "loss": 0.0134, + "step": 6775 + }, + { + "epoch": 1.7878188649396876, + "grad_norm": 0.0035508016590029, + "learning_rate": 2.1216519329726878e-05, + "loss": 0.0231, + "step": 6780 + }, + { + "epoch": 1.7891371695998946, + "grad_norm": 0.13871321082115173, + "learning_rate": 2.1084575801556937e-05, + "loss": 0.0296, + "step": 6785 + }, + { + "epoch": 1.7904554742601015, + "grad_norm": 0.002578354673460126, + "learning_rate": 2.095263227338699e-05, + "loss": 0.0178, + "step": 6790 + }, + { + "epoch": 1.7917737789203085, + "grad_norm": 0.5279458165168762, + "learning_rate": 2.082068874521705e-05, + "loss": 0.0336, + "step": 6795 + }, + { + "epoch": 1.7930920835805155, + "grad_norm": 0.0017439400544390082, + "learning_rate": 2.0688745217047104e-05, + "loss": 0.0031, + "step": 6800 + }, + { + "epoch": 1.7944103882407223, + "grad_norm": 0.007989778183400631, + "learning_rate": 2.055680168887716e-05, + "loss": 0.0081, + "step": 6805 + }, + { + "epoch": 1.7957286929009295, + "grad_norm": 0.015163813717663288, + "learning_rate": 2.042485816070722e-05, + "loss": 0.0234, + "step": 6810 + }, + { + "epoch": 1.7970469975611363, + "grad_norm": 0.10615389794111252, + "learning_rate": 2.0292914632537275e-05, + "loss": 0.0144, + "step": 6815 + }, + { + "epoch": 1.7983653022213435, + "grad_norm": 0.03466172143816948, + 
"learning_rate": 2.0160971104367334e-05, + "loss": 0.0036, + "step": 6820 + }, + { + "epoch": 1.7996836068815503, + "grad_norm": 0.047511328011751175, + "learning_rate": 2.0029027576197386e-05, + "loss": 0.002, + "step": 6825 + }, + { + "epoch": 1.8010019115417573, + "grad_norm": 0.019772246479988098, + "learning_rate": 1.9897084048027445e-05, + "loss": 0.0049, + "step": 6830 + }, + { + "epoch": 1.8023202162019643, + "grad_norm": 0.1156701073050499, + "learning_rate": 1.9765140519857504e-05, + "loss": 0.0033, + "step": 6835 + }, + { + "epoch": 1.8036385208621712, + "grad_norm": 0.010991690680384636, + "learning_rate": 1.963319699168756e-05, + "loss": 0.0036, + "step": 6840 + }, + { + "epoch": 1.8049568255223782, + "grad_norm": 0.29658815264701843, + "learning_rate": 1.9501253463517615e-05, + "loss": 0.0042, + "step": 6845 + }, + { + "epoch": 1.8062751301825852, + "grad_norm": 0.056147243827581406, + "learning_rate": 1.936930993534767e-05, + "loss": 0.0052, + "step": 6850 + }, + { + "epoch": 1.8075934348427922, + "grad_norm": 0.010382590815424919, + "learning_rate": 1.923736640717773e-05, + "loss": 0.0033, + "step": 6855 + }, + { + "epoch": 1.808911739502999, + "grad_norm": 1.1247020959854126, + "learning_rate": 1.9105422879007786e-05, + "loss": 0.0112, + "step": 6860 + }, + { + "epoch": 1.8102300441632062, + "grad_norm": 1.4515737295150757, + "learning_rate": 1.897347935083784e-05, + "loss": 0.0202, + "step": 6865 + }, + { + "epoch": 1.811548348823413, + "grad_norm": 0.016307830810546875, + "learning_rate": 1.88415358226679e-05, + "loss": 0.0148, + "step": 6870 + }, + { + "epoch": 1.8128666534836202, + "grad_norm": 0.0745878592133522, + "learning_rate": 1.8709592294497956e-05, + "loss": 0.0062, + "step": 6875 + }, + { + "epoch": 1.814184958143827, + "grad_norm": 0.02554013952612877, + "learning_rate": 1.8577648766328012e-05, + "loss": 0.003, + "step": 6880 + }, + { + "epoch": 1.815503262804034, + "grad_norm": 0.45748665928840637, + "learning_rate": 1.844570523815807e-05, + "loss": 0.0386, + "step": 6885 + }, + { + "epoch": 1.816821567464241, + "grad_norm": 0.013801589608192444, + "learning_rate": 1.8313761709988126e-05, + "loss": 0.0342, + "step": 6890 + }, + { + "epoch": 1.818139872124448, + "grad_norm": 0.6251696944236755, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.0101, + "step": 6895 + }, + { + "epoch": 1.819458176784655, + "grad_norm": 0.28203102946281433, + "learning_rate": 1.8049874653648238e-05, + "loss": 0.0032, + "step": 6900 + }, + { + "epoch": 1.820776481444862, + "grad_norm": 0.28511062264442444, + "learning_rate": 1.7917931125478297e-05, + "loss": 0.0343, + "step": 6905 + }, + { + "epoch": 1.822094786105069, + "grad_norm": 0.004940215498209, + "learning_rate": 1.7785987597308352e-05, + "loss": 0.0265, + "step": 6910 + }, + { + "epoch": 1.8234130907652757, + "grad_norm": 0.002903093583881855, + "learning_rate": 1.7654044069138408e-05, + "loss": 0.0025, + "step": 6915 + }, + { + "epoch": 1.824731395425483, + "grad_norm": 0.008801674470305443, + "learning_rate": 1.7522100540968467e-05, + "loss": 0.0246, + "step": 6920 + }, + { + "epoch": 1.8260497000856897, + "grad_norm": 0.13823826611042023, + "learning_rate": 1.7390157012798523e-05, + "loss": 0.0058, + "step": 6925 + }, + { + "epoch": 1.827368004745897, + "grad_norm": 0.020868878811597824, + "learning_rate": 1.725821348462858e-05, + "loss": 0.0014, + "step": 6930 + }, + { + "epoch": 1.8286863094061037, + "grad_norm": 0.0027356524951756, + "learning_rate": 1.7126269956458638e-05, + "loss": 0.0035, + "step": 6935 + 
}, + { + "epoch": 1.8300046140663109, + "grad_norm": 0.06023023650050163, + "learning_rate": 1.6994326428288693e-05, + "loss": 0.0212, + "step": 6940 + }, + { + "epoch": 1.8313229187265176, + "grad_norm": 0.0009826788445934653, + "learning_rate": 1.686238290011875e-05, + "loss": 0.0034, + "step": 6945 + }, + { + "epoch": 1.8326412233867246, + "grad_norm": 0.2867647707462311, + "learning_rate": 1.6730439371948805e-05, + "loss": 0.0146, + "step": 6950 + }, + { + "epoch": 1.8339595280469316, + "grad_norm": 0.004501632414758205, + "learning_rate": 1.6598495843778864e-05, + "loss": 0.0026, + "step": 6955 + }, + { + "epoch": 1.8352778327071386, + "grad_norm": 0.01251616608351469, + "learning_rate": 1.6466552315608923e-05, + "loss": 0.0107, + "step": 6960 + }, + { + "epoch": 1.8365961373673456, + "grad_norm": 0.054781850427389145, + "learning_rate": 1.6334608787438975e-05, + "loss": 0.0044, + "step": 6965 + }, + { + "epoch": 1.8379144420275526, + "grad_norm": 0.1120501235127449, + "learning_rate": 1.6202665259269034e-05, + "loss": 0.0284, + "step": 6970 + }, + { + "epoch": 1.8392327466877596, + "grad_norm": 0.001668553682975471, + "learning_rate": 1.607072173109909e-05, + "loss": 0.0169, + "step": 6975 + }, + { + "epoch": 1.8405510513479664, + "grad_norm": 1.6374458074569702, + "learning_rate": 1.593877820292915e-05, + "loss": 0.031, + "step": 6980 + }, + { + "epoch": 1.8418693560081736, + "grad_norm": 0.012474550865590572, + "learning_rate": 1.5806834674759204e-05, + "loss": 0.0037, + "step": 6985 + }, + { + "epoch": 1.8431876606683804, + "grad_norm": 0.014898869208991528, + "learning_rate": 1.567489114658926e-05, + "loss": 0.003, + "step": 6990 + }, + { + "epoch": 1.8445059653285876, + "grad_norm": 0.035570453852415085, + "learning_rate": 1.554294761841932e-05, + "loss": 0.0038, + "step": 6995 + }, + { + "epoch": 1.8458242699887943, + "grad_norm": 0.9279152750968933, + "learning_rate": 1.541100409024937e-05, + "loss": 0.0235, + "step": 7000 + }, + { + "epoch": 1.8458242699887943, + "eval_loss": 0.022339830175042152, + "eval_runtime": 451.9068, + "eval_samples_per_second": 7.462, + "eval_steps_per_second": 3.731, + "step": 7000 + }, + { + "epoch": 1.8471425746490013, + "grad_norm": 0.0551234595477581, + "learning_rate": 1.527906056207943e-05, + "loss": 0.0111, + "step": 7005 + }, + { + "epoch": 1.8484608793092083, + "grad_norm": 0.011982420459389687, + "learning_rate": 1.514711703390949e-05, + "loss": 0.0279, + "step": 7010 + }, + { + "epoch": 1.8497791839694153, + "grad_norm": 0.0005129808560013771, + "learning_rate": 1.5015173505739543e-05, + "loss": 0.006, + "step": 7015 + }, + { + "epoch": 1.8510974886296223, + "grad_norm": 0.00803748145699501, + "learning_rate": 1.4883229977569602e-05, + "loss": 0.0039, + "step": 7020 + }, + { + "epoch": 1.8524157932898293, + "grad_norm": 0.012161086313426495, + "learning_rate": 1.4751286449399656e-05, + "loss": 0.002, + "step": 7025 + }, + { + "epoch": 1.8537340979500363, + "grad_norm": 0.09517266601324081, + "learning_rate": 1.4619342921229714e-05, + "loss": 0.0021, + "step": 7030 + }, + { + "epoch": 1.855052402610243, + "grad_norm": 0.024397969245910645, + "learning_rate": 1.4487399393059773e-05, + "loss": 0.0031, + "step": 7035 + }, + { + "epoch": 1.8563707072704503, + "grad_norm": 0.010253255255520344, + "learning_rate": 1.4355455864889827e-05, + "loss": 0.005, + "step": 7040 + }, + { + "epoch": 1.857689011930657, + "grad_norm": 0.30331942439079285, + "learning_rate": 1.4223512336719886e-05, + "loss": 0.0048, + "step": 7045 + }, + { + "epoch": 
1.8590073165908643, + "grad_norm": 0.4087940454483032, + "learning_rate": 1.409156880854994e-05, + "loss": 0.0375, + "step": 7050 + }, + { + "epoch": 1.860325621251071, + "grad_norm": 0.01011588517576456, + "learning_rate": 1.3959625280379999e-05, + "loss": 0.0218, + "step": 7055 + }, + { + "epoch": 1.8616439259112783, + "grad_norm": 0.004677001852542162, + "learning_rate": 1.3827681752210056e-05, + "loss": 0.0183, + "step": 7060 + }, + { + "epoch": 1.862962230571485, + "grad_norm": 0.8648074269294739, + "learning_rate": 1.3695738224040112e-05, + "loss": 0.0243, + "step": 7065 + }, + { + "epoch": 1.864280535231692, + "grad_norm": 0.00018874031957238913, + "learning_rate": 1.356379469587017e-05, + "loss": 0.0204, + "step": 7070 + }, + { + "epoch": 1.865598839891899, + "grad_norm": 0.010363743640482426, + "learning_rate": 1.3431851167700223e-05, + "loss": 0.0151, + "step": 7075 + }, + { + "epoch": 1.866917144552106, + "grad_norm": 0.015046795830130577, + "learning_rate": 1.3299907639530282e-05, + "loss": 0.0028, + "step": 7080 + }, + { + "epoch": 1.868235449212313, + "grad_norm": 0.00892347190529108, + "learning_rate": 1.316796411136034e-05, + "loss": 0.0076, + "step": 7085 + }, + { + "epoch": 1.86955375387252, + "grad_norm": 0.011039508506655693, + "learning_rate": 1.3036020583190395e-05, + "loss": 0.0041, + "step": 7090 + }, + { + "epoch": 1.870872058532727, + "grad_norm": 0.612829864025116, + "learning_rate": 1.2904077055020453e-05, + "loss": 0.0406, + "step": 7095 + }, + { + "epoch": 1.8721903631929337, + "grad_norm": 0.02630307897925377, + "learning_rate": 1.2772133526850508e-05, + "loss": 0.005, + "step": 7100 + }, + { + "epoch": 1.873508667853141, + "grad_norm": 0.05626239255070686, + "learning_rate": 1.2640189998680566e-05, + "loss": 0.0494, + "step": 7105 + }, + { + "epoch": 1.8748269725133477, + "grad_norm": 0.009870803914964199, + "learning_rate": 1.2508246470510623e-05, + "loss": 0.0036, + "step": 7110 + }, + { + "epoch": 1.876145277173555, + "grad_norm": 0.0034679556265473366, + "learning_rate": 1.2376302942340679e-05, + "loss": 0.0026, + "step": 7115 + }, + { + "epoch": 1.8774635818337617, + "grad_norm": 0.0021383303683251143, + "learning_rate": 1.2244359414170734e-05, + "loss": 0.0028, + "step": 7120 + }, + { + "epoch": 1.8787818864939687, + "grad_norm": 0.016683265566825867, + "learning_rate": 1.2112415886000793e-05, + "loss": 0.0027, + "step": 7125 + }, + { + "epoch": 1.8801001911541757, + "grad_norm": 0.30483224987983704, + "learning_rate": 1.1980472357830849e-05, + "loss": 0.0081, + "step": 7130 + }, + { + "epoch": 1.8814184958143827, + "grad_norm": 0.055007629096508026, + "learning_rate": 1.1848528829660906e-05, + "loss": 0.0058, + "step": 7135 + }, + { + "epoch": 1.8827368004745897, + "grad_norm": 0.013665193691849709, + "learning_rate": 1.1716585301490962e-05, + "loss": 0.0289, + "step": 7140 + }, + { + "epoch": 1.8840551051347967, + "grad_norm": 0.004984239581972361, + "learning_rate": 1.158464177332102e-05, + "loss": 0.0049, + "step": 7145 + }, + { + "epoch": 1.8853734097950037, + "grad_norm": 0.4533900320529938, + "learning_rate": 1.1452698245151077e-05, + "loss": 0.0127, + "step": 7150 + }, + { + "epoch": 1.8866917144552104, + "grad_norm": 0.8876304030418396, + "learning_rate": 1.1320754716981132e-05, + "loss": 0.0124, + "step": 7155 + }, + { + "epoch": 1.8880100191154177, + "grad_norm": 0.004243049304932356, + "learning_rate": 1.118881118881119e-05, + "loss": 0.0034, + "step": 7160 + }, + { + "epoch": 1.8893283237756244, + "grad_norm": 0.0425611287355423, + 
"learning_rate": 1.1056867660641245e-05, + "loss": 0.0108, + "step": 7165 + }, + { + "epoch": 1.8906466284358316, + "grad_norm": 0.005729519762098789, + "learning_rate": 1.0924924132471303e-05, + "loss": 0.0127, + "step": 7170 + }, + { + "epoch": 1.8919649330960384, + "grad_norm": 0.02838645875453949, + "learning_rate": 1.079298060430136e-05, + "loss": 0.0055, + "step": 7175 + }, + { + "epoch": 1.8932832377562456, + "grad_norm": 0.007703767623752356, + "learning_rate": 1.0661037076131416e-05, + "loss": 0.0029, + "step": 7180 + }, + { + "epoch": 1.8946015424164524, + "grad_norm": 0.00507679907605052, + "learning_rate": 1.0529093547961473e-05, + "loss": 0.0019, + "step": 7185 + }, + { + "epoch": 1.8959198470766594, + "grad_norm": 0.3672322630882263, + "learning_rate": 1.0397150019791529e-05, + "loss": 0.0502, + "step": 7190 + }, + { + "epoch": 1.8972381517368664, + "grad_norm": 0.002695564879104495, + "learning_rate": 1.0265206491621586e-05, + "loss": 0.0022, + "step": 7195 + }, + { + "epoch": 1.8985564563970734, + "grad_norm": 0.6876901984214783, + "learning_rate": 1.0133262963451644e-05, + "loss": 0.0384, + "step": 7200 + }, + { + "epoch": 1.8998747610572804, + "grad_norm": 0.0224466510117054, + "learning_rate": 1.0001319435281701e-05, + "loss": 0.0019, + "step": 7205 + }, + { + "epoch": 1.9011930657174874, + "grad_norm": 0.21176137030124664, + "learning_rate": 9.869375907111757e-06, + "loss": 0.0214, + "step": 7210 + }, + { + "epoch": 1.9025113703776944, + "grad_norm": 0.01078563928604126, + "learning_rate": 9.737432378941814e-06, + "loss": 0.0045, + "step": 7215 + }, + { + "epoch": 1.9038296750379011, + "grad_norm": 0.0053437380120158195, + "learning_rate": 9.60548885077187e-06, + "loss": 0.0038, + "step": 7220 + }, + { + "epoch": 1.9051479796981083, + "grad_norm": 0.019259070977568626, + "learning_rate": 9.473545322601927e-06, + "loss": 0.0025, + "step": 7225 + }, + { + "epoch": 1.906466284358315, + "grad_norm": 0.01319583784788847, + "learning_rate": 9.341601794431984e-06, + "loss": 0.0036, + "step": 7230 + }, + { + "epoch": 1.9077845890185223, + "grad_norm": 0.012393418699502945, + "learning_rate": 9.20965826626204e-06, + "loss": 0.0033, + "step": 7235 + }, + { + "epoch": 1.909102893678729, + "grad_norm": 0.37064847350120544, + "learning_rate": 9.077714738092097e-06, + "loss": 0.039, + "step": 7240 + }, + { + "epoch": 1.910421198338936, + "grad_norm": 0.012969265691936016, + "learning_rate": 8.945771209922153e-06, + "loss": 0.0023, + "step": 7245 + }, + { + "epoch": 1.911739502999143, + "grad_norm": 0.187465637922287, + "learning_rate": 8.81382768175221e-06, + "loss": 0.0176, + "step": 7250 + }, + { + "epoch": 1.91305780765935, + "grad_norm": 0.22874793410301208, + "learning_rate": 8.681884153582268e-06, + "loss": 0.0028, + "step": 7255 + }, + { + "epoch": 1.914376112319557, + "grad_norm": 0.015071459114551544, + "learning_rate": 8.549940625412323e-06, + "loss": 0.0197, + "step": 7260 + }, + { + "epoch": 1.915694416979764, + "grad_norm": 0.0037113677244633436, + "learning_rate": 8.41799709724238e-06, + "loss": 0.0026, + "step": 7265 + }, + { + "epoch": 1.917012721639971, + "grad_norm": 0.1318834125995636, + "learning_rate": 8.286053569072436e-06, + "loss": 0.0025, + "step": 7270 + }, + { + "epoch": 1.9183310263001778, + "grad_norm": 0.025826094672083855, + "learning_rate": 8.154110040902495e-06, + "loss": 0.0031, + "step": 7275 + }, + { + "epoch": 1.919649330960385, + "grad_norm": 0.02691330574452877, + "learning_rate": 8.022166512732551e-06, + "loss": 0.0038, + "step": 7280 + }, + { 
+ "epoch": 1.9209676356205918, + "grad_norm": 0.026079120114445686, + "learning_rate": 7.890222984562608e-06, + "loss": 0.0035, + "step": 7285 + }, + { + "epoch": 1.922285940280799, + "grad_norm": 0.20154571533203125, + "learning_rate": 7.758279456392664e-06, + "loss": 0.0031, + "step": 7290 + }, + { + "epoch": 1.9236042449410058, + "grad_norm": 0.000983367906883359, + "learning_rate": 7.6263359282227206e-06, + "loss": 0.0048, + "step": 7295 + }, + { + "epoch": 1.924922549601213, + "grad_norm": 0.018915656954050064, + "learning_rate": 7.494392400052777e-06, + "loss": 0.0028, + "step": 7300 + }, + { + "epoch": 1.9262408542614198, + "grad_norm": 0.028158968314528465, + "learning_rate": 7.362448871882835e-06, + "loss": 0.0055, + "step": 7305 + }, + { + "epoch": 1.9275591589216268, + "grad_norm": 0.00263324286788702, + "learning_rate": 7.230505343712892e-06, + "loss": 0.0052, + "step": 7310 + }, + { + "epoch": 1.9288774635818338, + "grad_norm": 0.009877101518213749, + "learning_rate": 7.0985618155429475e-06, + "loss": 0.0023, + "step": 7315 + }, + { + "epoch": 1.9301957682420408, + "grad_norm": 0.0, + "learning_rate": 6.966618287373004e-06, + "loss": 0.0359, + "step": 7320 + }, + { + "epoch": 1.9315140729022477, + "grad_norm": 0.458391398191452, + "learning_rate": 6.8346747592030605e-06, + "loss": 0.0074, + "step": 7325 + }, + { + "epoch": 1.9328323775624545, + "grad_norm": 0.01231459341943264, + "learning_rate": 6.702731231033119e-06, + "loss": 0.0037, + "step": 7330 + }, + { + "epoch": 1.9341506822226617, + "grad_norm": 0.020665613934397697, + "learning_rate": 6.570787702863175e-06, + "loss": 0.0042, + "step": 7335 + }, + { + "epoch": 1.9354689868828685, + "grad_norm": 0.10898768156766891, + "learning_rate": 6.438844174693232e-06, + "loss": 0.0231, + "step": 7340 + }, + { + "epoch": 1.9367872915430757, + "grad_norm": 0.007229386828839779, + "learning_rate": 6.306900646523288e-06, + "loss": 0.0046, + "step": 7345 + }, + { + "epoch": 1.9381055962032825, + "grad_norm": 0.06096978858113289, + "learning_rate": 6.1749571183533456e-06, + "loss": 0.0032, + "step": 7350 + }, + { + "epoch": 1.9394239008634897, + "grad_norm": 0.0034814421087503433, + "learning_rate": 6.043013590183401e-06, + "loss": 0.0131, + "step": 7355 + }, + { + "epoch": 1.9407422055236965, + "grad_norm": 0.04464314505457878, + "learning_rate": 5.9110700620134586e-06, + "loss": 0.0075, + "step": 7360 + }, + { + "epoch": 1.9420605101839035, + "grad_norm": 0.024586567655205727, + "learning_rate": 5.779126533843515e-06, + "loss": 0.0051, + "step": 7365 + }, + { + "epoch": 1.9433788148441105, + "grad_norm": 0.1543113738298416, + "learning_rate": 5.647183005673572e-06, + "loss": 0.0037, + "step": 7370 + }, + { + "epoch": 1.9446971195043175, + "grad_norm": 0.2567637860774994, + "learning_rate": 5.515239477503629e-06, + "loss": 0.0407, + "step": 7375 + }, + { + "epoch": 1.9460154241645244, + "grad_norm": 0.00815210398286581, + "learning_rate": 5.3832959493336855e-06, + "loss": 0.0048, + "step": 7380 + }, + { + "epoch": 1.9473337288247314, + "grad_norm": 0.010180729441344738, + "learning_rate": 5.251352421163743e-06, + "loss": 0.002, + "step": 7385 + }, + { + "epoch": 1.9486520334849384, + "grad_norm": 0.01142155285924673, + "learning_rate": 5.1194088929937985e-06, + "loss": 0.0075, + "step": 7390 + }, + { + "epoch": 1.9499703381451452, + "grad_norm": 0.005243134684860706, + "learning_rate": 4.987465364823855e-06, + "loss": 0.002, + "step": 7395 + }, + { + "epoch": 1.9512886428053524, + "grad_norm": 0.289771169424057, + "learning_rate": 
4.855521836653912e-06, + "loss": 0.016, + "step": 7400 + }, + { + "epoch": 1.9526069474655592, + "grad_norm": 0.0042951651848852634, + "learning_rate": 4.723578308483969e-06, + "loss": 0.0301, + "step": 7405 + }, + { + "epoch": 1.9539252521257664, + "grad_norm": 0.27518171072006226, + "learning_rate": 4.591634780314026e-06, + "loss": 0.0059, + "step": 7410 + }, + { + "epoch": 1.9552435567859732, + "grad_norm": 0.13695034384727478, + "learning_rate": 4.459691252144083e-06, + "loss": 0.0039, + "step": 7415 + }, + { + "epoch": 1.9565618614461802, + "grad_norm": 0.32960009574890137, + "learning_rate": 4.327747723974139e-06, + "loss": 0.0159, + "step": 7420 + }, + { + "epoch": 1.9578801661063872, + "grad_norm": 0.02581116557121277, + "learning_rate": 4.195804195804197e-06, + "loss": 0.003, + "step": 7425 + }, + { + "epoch": 1.9591984707665941, + "grad_norm": 0.01672324910759926, + "learning_rate": 4.063860667634252e-06, + "loss": 0.0233, + "step": 7430 + }, + { + "epoch": 1.9605167754268011, + "grad_norm": 0.021988827735185623, + "learning_rate": 3.93191713946431e-06, + "loss": 0.0029, + "step": 7435 + }, + { + "epoch": 1.9618350800870081, + "grad_norm": 0.27279871702194214, + "learning_rate": 3.799973611294366e-06, + "loss": 0.0023, + "step": 7440 + }, + { + "epoch": 1.9631533847472151, + "grad_norm": 0.002893030410632491, + "learning_rate": 3.6680300831244226e-06, + "loss": 0.0031, + "step": 7445 + }, + { + "epoch": 1.964471689407422, + "grad_norm": 0.024236679077148438, + "learning_rate": 3.53608655495448e-06, + "loss": 0.0519, + "step": 7450 + }, + { + "epoch": 1.965789994067629, + "grad_norm": 0.011010506190359592, + "learning_rate": 3.4041430267845365e-06, + "loss": 0.009, + "step": 7455 + }, + { + "epoch": 1.9671082987278359, + "grad_norm": 0.11187774688005447, + "learning_rate": 3.2721994986145926e-06, + "loss": 0.0197, + "step": 7460 + }, + { + "epoch": 1.968426603388043, + "grad_norm": 0.09504564106464386, + "learning_rate": 3.14025597044465e-06, + "loss": 0.0261, + "step": 7465 + }, + { + "epoch": 1.9697449080482499, + "grad_norm": 0.020677559077739716, + "learning_rate": 3.008312442274707e-06, + "loss": 0.0232, + "step": 7470 + }, + { + "epoch": 1.971063212708457, + "grad_norm": 0.020134272053837776, + "learning_rate": 2.8763689141047634e-06, + "loss": 0.0208, + "step": 7475 + }, + { + "epoch": 1.9723815173686639, + "grad_norm": 0.07275384664535522, + "learning_rate": 2.74442538593482e-06, + "loss": 0.0223, + "step": 7480 + }, + { + "epoch": 1.9736998220288708, + "grad_norm": 0.001021620468236506, + "learning_rate": 2.612481857764877e-06, + "loss": 0.003, + "step": 7485 + }, + { + "epoch": 1.9750181266890778, + "grad_norm": 0.011956339702010155, + "learning_rate": 2.4805383295949337e-06, + "loss": 0.0083, + "step": 7490 + }, + { + "epoch": 1.9763364313492848, + "grad_norm": 0.017296286299824715, + "learning_rate": 2.3485948014249902e-06, + "loss": 0.0431, + "step": 7495 + }, + { + "epoch": 1.9776547360094918, + "grad_norm": 0.0122813880443573, + "learning_rate": 2.2166512732550468e-06, + "loss": 0.0146, + "step": 7500 + }, + { + "epoch": 1.9776547360094918, + "eval_loss": 0.02133146859705448, + "eval_runtime": 451.6814, + "eval_samples_per_second": 7.465, + "eval_steps_per_second": 3.733, + "step": 7500 + }, + { + "epoch": 1.9789730406696988, + "grad_norm": 0.08341170847415924, + "learning_rate": 2.0847077450851037e-06, + "loss": 0.0141, + "step": 7505 + }, + { + "epoch": 1.9802913453299058, + "grad_norm": 0.013888856396079063, + "learning_rate": 1.9527642169151606e-06, + "loss": 
0.0162, + "step": 7510 + }, + { + "epoch": 1.9816096499901126, + "grad_norm": 0.010684848763048649, + "learning_rate": 1.8208206887452173e-06, + "loss": 0.0023, + "step": 7515 + }, + { + "epoch": 1.9829279546503198, + "grad_norm": 0.01800103485584259, + "learning_rate": 1.6888771605752736e-06, + "loss": 0.0029, + "step": 7520 + }, + { + "epoch": 1.9842462593105266, + "grad_norm": 0.03569122776389122, + "learning_rate": 1.5569336324053306e-06, + "loss": 0.0024, + "step": 7525 + }, + { + "epoch": 1.9855645639707338, + "grad_norm": 0.005338775459676981, + "learning_rate": 1.4249901042353873e-06, + "loss": 0.0305, + "step": 7530 + }, + { + "epoch": 1.9868828686309405, + "grad_norm": 0.004656268749386072, + "learning_rate": 1.293046576065444e-06, + "loss": 0.0184, + "step": 7535 + }, + { + "epoch": 1.9882011732911475, + "grad_norm": 0.018160656094551086, + "learning_rate": 1.1611030478955007e-06, + "loss": 0.0217, + "step": 7540 + }, + { + "epoch": 1.9895194779513545, + "grad_norm": 0.01999780908226967, + "learning_rate": 1.0291595197255577e-06, + "loss": 0.0412, + "step": 7545 + }, + { + "epoch": 1.9908377826115615, + "grad_norm": 0.010717427358031273, + "learning_rate": 8.972159915556142e-07, + "loss": 0.0107, + "step": 7550 + }, + { + "epoch": 1.9921560872717685, + "grad_norm": 0.018457483500242233, + "learning_rate": 7.652724633856709e-07, + "loss": 0.003, + "step": 7555 + }, + { + "epoch": 1.9934743919319755, + "grad_norm": 0.02118489518761635, + "learning_rate": 6.333289352157277e-07, + "loss": 0.0045, + "step": 7560 + }, + { + "epoch": 1.9947926965921825, + "grad_norm": 0.012181553058326244, + "learning_rate": 5.013854070457844e-07, + "loss": 0.0035, + "step": 7565 + }, + { + "epoch": 1.9961110012523893, + "grad_norm": 0.018975401297211647, + "learning_rate": 3.6944187887584116e-07, + "loss": 0.0236, + "step": 7570 + }, + { + "epoch": 1.9974293059125965, + "grad_norm": 0.46280673146247864, + "learning_rate": 2.3749835070589788e-07, + "loss": 0.0428, + "step": 7575 + }, + { + "epoch": 1.9987476105728033, + "grad_norm": 0.02338407374918461, + "learning_rate": 1.055548225359546e-07, + "loss": 0.0052, + "step": 7580 + } + ], + "logging_steps": 5, + "max_steps": 7584, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.120985601665413e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7584/training_args.bin b/checkpoint-7584/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4c05e0585b1aef03c0cbe4c50207ce04940bd838 --- /dev/null +++ b/checkpoint-7584/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79dfa687fdd0c9908ab6b63535817e7567b29b0b483ac228723218f6f5fdeec5 +size 5688